This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Consolidate some regex OPS
author Karl Williamson <public@khwilliamson.com>
Tue, 18 Dec 2012 04:37:40 +0000 (21:37 -0700)
committer Karl Williamson <public@khwilliamson.com>
Sat, 22 Dec 2012 18:11:32 +0000 (11:11 -0700)
The regular expression operation POSIXA works on any of the (currently)
16 posix classes (like \w and [:graph:]) under the regex modifier /a.
This commit creates similar operations for the other modifiers: POSIXL
(for /l), POSIXD (for /d), POSIXU (for /u), plus their complements.

It causes these ops to be generated instead of the ALNUM, DIGIT,
HORIZWS, SPACE, and VERTWS ops, as well as all their variants.  The net
saving is 22 regnode types.

The reason to do this is for maintenance.  As of this commit, there are
now 22 fewer node types for which code has to be maintained.  The code
for each variant was essentially the same logic, but on different
operands.  It would be easy to make a change to one copy and forget to
make the corresponding change in the others.  Indeed, this patch fixes
[perl #114272] in which one copy was out of sync with others.

This patch actually reduces the number of separate code paths to 5:
POSIXA, NPOSIXA, POSIXL, POSIXD, and POSIXU.  The complements of the
last 3 use the same code path as their non-complemented version, except
that a variable is initialized differently.  The code then XORs this
variable with its result to do the complementing or not.  Further, the
POSIXD branch now just checks if the target string being matched is
UTF-8 or not, and then jumps to either the POSIXU or POSIXA code
respectively.  So, there are effectively only 4 cases that are coded:
POSIXA, NPOSIXA, POSIXL, and POSIXU.  (POSIXA doesn't have to worry
about UTF-8, while NPOSIXA does; hence, for efficiency, these two are
coded separately.)

Removing all this code saves memory.  The output of the Linux size
command shows that the perl executable was shrunk by 33K bytes on my
platform compiled under -O0 (.7%) and by 18K bytes (1.3%) under -O2.

The reason this patch was doable was previous work in numbering the
POSIX classes, so that they could be indexed in arrays and bit
positions.  This is a large patch; I didn't see how to break it into
smaller components.

I chose to make this code more efficient as opposed to saving even more
memory.  Thus there is a separate loop that is jumped to after we know
we have to load a swash; this just saves having to test if the swash is
loaded each time through the loop.  I avoid loading the swash until
absolutely necessary.  In places in the previous version of this code,
the swash was loaded when the input was UTF-8, even if it wasn't yet
needed (and might never be if the input didn't contain anything above
Latin1); apparently to avoid the extra test per iteration.

The Perl test suite runs slightly faster on my platform with this patch
under -O0, and the speeds are indistinguishable under -O2.  This is in
spite of these new POSIX regops being unknown to the regex optimizer
(this will be addressed in future commits), and extra machine
instructions being required for each character (the xor, and some
shifting and masking).  I expect this is a result of better caching, and
not loading swashes unless absolutely necessary.

embed.fnc
embed.h
handy.h
proto.h
regcomp.c
regcomp.sym
regexec.c
regnodes.h

index 5af5c97..2a5b2b3 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -2028,6 +2028,7 @@ Es        |U8     |regtail_study  |NN struct RExC_state_t *pRExC_state \
 
 #if defined(PERL_IN_REGEXEC_C)
 ERs    |bool   |isFOO_lc       |const U8 classnum|const U8 character
+ERs    |bool   |isFOO_utf8_lc  |const U8 classnum|NN const U8* character
 ERs    |I32    |regmatch       |NN regmatch_info *reginfo|NN char *startpos|NN regnode *prog
 ERs    |I32    |regrepeat      |NN const regexp *prog|NN char **startposp|NN const regnode *p|I32 max|int depth
 ERs    |I32    |regtry         |NN regmatch_info *reginfo|NN char **startposp
diff --git a/embed.h b/embed.h
index c1ca676..786892d 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define core_regclass_swash(a,b,c,d)   S_core_regclass_swash(aTHX_ a,b,c,d)
 #define find_byclass(a,b,c,d,e)        S_find_byclass(aTHX_ a,b,c,d,e)
 #define isFOO_lc(a,b)          S_isFOO_lc(aTHX_ a,b)
+#define isFOO_utf8_lc(a,b)     S_isFOO_utf8_lc(aTHX_ a,b)
 #define reg_check_named_buff_matched(a,b)      S_reg_check_named_buff_matched(aTHX_ a,b)
 #define regcppop(a,b)          S_regcppop(aTHX_ a,b)
 #define regcppush(a,b,c)       S_regcppush(aTHX_ a,b,c)
diff --git a/handy.h b/handy.h
index aaeda4a..223324a 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -803,7 +803,7 @@ typedef enum {
 #define POSIX_SWASH_COUNT _FIRST_NON_SWASH_CC
 #define POSIX_CC_COUNT    (_HIGHEST_REGCOMP_DOT_H_SYNC + 1)
 
-#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C)
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
 #   if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
        || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
        || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8
diff --git a/proto.h b/proto.h
index d47e5de..70b2dd4 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6799,6 +6799,12 @@ STATIC char*     S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, cons
 STATIC bool    S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
                        __attribute__warn_unused_result__;
 
+STATIC bool    S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \
+       assert(character)
+
 STATIC I32     S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1)
index c5bc8f4..59e4710 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2950,34 +2950,6 @@ typedef struct scan_frame {
 
 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
 
-#define CASE_SYNST_FNC(nAmE)                                       \
-case nAmE:                                                         \
-    if (flags & SCF_DO_STCLASS_AND) {                              \
-           for (value = 0; value < 256; value++)                  \
-               if (!is_ ## nAmE ## _cp(value))                       \
-                   ANYOF_BITMAP_CLEAR(data->start_class, value);  \
-    }                                                              \
-    else {                                                         \
-           for (value = 0; value < 256; value++)                  \
-               if (is_ ## nAmE ## _cp(value))                        \
-                   ANYOF_BITMAP_SET(data->start_class, value);    \
-    }                                                              \
-    break;                                                         \
-case N ## nAmE:                                                    \
-    if (flags & SCF_DO_STCLASS_AND) {                              \
-           for (value = 0; value < 256; value++)                   \
-               if (is_ ## nAmE ## _cp(value))                         \
-                   ANYOF_BITMAP_CLEAR(data->start_class, value);   \
-    }                                                               \
-    else {                                                          \
-           for (value = 0; value < 256; value++)                   \
-               if (!is_ ## nAmE ## _cp(value))                        \
-                   ANYOF_BITMAP_SET(data->start_class, value);     \
-    }                                                               \
-    break
-
-
-
 STATIC I32
 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         I32 *minlenp, I32 *deltap,
@@ -4147,11 +4119,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            }
            min++;
            if (flags & SCF_DO_STCLASS) {
+                int loop_max = 256;
                data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
 
                /* Some of the logic below assumes that switching
                   locale on will only add false positives. */
                switch (PL_regkind[OP(scan)]) {
+                    U8 classnum;
+
                case SANY:
                default:
                  do_default:
@@ -4178,200 +4153,75 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        cl_or(pRExC_state, data->start_class,
                              (struct regnode_charclass_class*)scan);
                    break;
-               case ALNUM:
+               case POSIXA:
+                    loop_max = 128;
+               case POSIXL:
+               case POSIXD:
+               case POSIXU:
+                    classnum = FLAGS(scan);
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
-                            if (OP(scan) == ALNUMU) {
-                                for (value = 0; value < 256; value++) {
-                                    if (!isWORDCHAR_L1(value)) {
-                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
-                                    }
-                                }
-                            } else {
-                                for (value = 0; value < 256; value++) {
-                                    if (!isALNUM(value)) {
-                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
-                                    }
+                           ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
+                            for (value = 0; value < loop_max; value++) {
+                                if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
                                 }
                             }
                        }
                    }
                    else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
+                       if (data->start_class->flags & ANYOF_LOCALE) {
+                           ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
+                        }
+                        else {
 
                        /* Even if under locale, set the bits for non-locale
                         * in case it isn't a true locale-node.  This will
                         * create false positives if it truly is locale */
-                        if (OP(scan) == ALNUMU) {
-                            for (value = 0; value < 256; value++) {
-                                if (isWORDCHAR_L1(value)) {
-                                    ANYOF_BITMAP_SET(data->start_class, value);
-                                }
-                            }
-                        } else {
-                            for (value = 0; value < 256; value++) {
-                                if (isALNUM(value)) {
-                                    ANYOF_BITMAP_SET(data->start_class, value);
-                                }
+                        for (value = 0; value < loop_max; value++) {
+                            if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
                             }
                         }
+                        }
                    }
                    break;
-               case NALNUM:
+               case NPOSIXA:
+                    loop_max = 128;
+               case NPOSIXL:
+               case NPOSIXU:
+               case NPOSIXD:
+                    classnum = FLAGS(scan);
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
-                            if (OP(scan) == NALNUMU) {
-                                for (value = 0; value < 256; value++) {
-                                    if (isWORDCHAR_L1(value)) {
-                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
-                                    }
+                           ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
+                            for (value = 0; value < loop_max; value++) {
+                                if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
                                 }
-                            } else {
-                                for (value = 0; value < 256; value++) {
-                                    if (isALNUM(value)) {
-                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
-                                    }
-                                }
-                           }
+                            }
                        }
                    }
                    else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
+                       if (data->start_class->flags & ANYOF_LOCALE) {
+                           ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
+                        }
+                        else {
 
                        /* Even if under locale, set the bits for non-locale in
                         * case it isn't a true locale-node.  This will create
                         * false positives if it truly is locale */
-                       if (OP(scan) == NALNUMU) {
-                           for (value = 0; value < 256; value++) {
-                               if (! isWORDCHAR_L1(value)) {
-                                   ANYOF_BITMAP_SET(data->start_class, value);
-                               }
-                           }
-                       } else {
-                           for (value = 0; value < 256; value++) {
-                               if (! isALNUM(value)) {
-                                   ANYOF_BITMAP_SET(data->start_class, value);
-                               }
-                           }
-                       }
-                   }
-                   break;
-               case SPACE:
-                   if (flags & SCF_DO_STCLASS_AND) {
-                       if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                           if (OP(scan) == SPACEU) {
-                                for (value = 0; value < 256; value++) {
-                                    if (!isSPACE_L1(value)) {
-                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
-                                    }
-                                }
-                            } else {
-                                for (value = 0; value < 256; value++) {
-                                    if (!isSPACE(value)) {
-                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
-                                    }
-                                }
+                        for (value = 0; value < loop_max; value++) {
+                            if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
                             }
-                       }
-                   }
-                   else {
-                        if (data->start_class->flags & ANYOF_LOCALE) {
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
                         }
-                        if (OP(scan) == SPACEU) {
-                            for (value = 0; value < 256; value++) {
-                                if (isSPACE_L1(value)) {
-                                    ANYOF_BITMAP_SET(data->start_class, value);
-                                }
-                            }
-                        } else {
-                            for (value = 0; value < 256; value++) {
-                                if (isSPACE(value)) {
-                                    ANYOF_BITMAP_SET(data->start_class, value);
-                                }
-                            }
-                       }
-                   }
-                   break;
-               case NSPACE:
-                   if (flags & SCF_DO_STCLASS_AND) {
-                       if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                           ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
-                            if (OP(scan) == NSPACEU) {
-                                for (value = 0; value < 256; value++) {
-                                    if (isSPACE_L1(value)) {
-                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
-                                    }
-                                }
-                            } else {
-                                for (value = 0; value < 256; value++) {
-                                    if (isSPACE(value)) {
-                                        ANYOF_BITMAP_CLEAR(data->start_class, value);
-                                    }
-                                }
-                            }
-                       }
-                   }
-                   else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                        if (OP(scan) == NSPACEU) {
-                            for (value = 0; value < 256; value++) {
-                                if (!isSPACE_L1(value)) {
-                                    ANYOF_BITMAP_SET(data->start_class, value);
-                                }
-                            }
+                        if (PL_regkind[OP(scan)] == NPOSIXD) {
+                            data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
                         }
-                        else {
-                            for (value = 0; value < 256; value++) {
-                                if (!isSPACE(value)) {
-                                    ANYOF_BITMAP_SET(data->start_class, value);
-                                }
-                            }
                         }
                    }
                    break;
-               case DIGIT:
-                   if (flags & SCF_DO_STCLASS_AND) {
-                       if (!(data->start_class->flags & ANYOF_LOCALE)) {
-                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
-                           for (value = 0; value < 256; value++)
-                               if (!isDIGIT(value))
-                                   ANYOF_BITMAP_CLEAR(data->start_class, value);
-                       }
-                   }
-                   else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
-                       for (value = 0; value < 256; value++)
-                           if (isDIGIT(value))
-                               ANYOF_BITMAP_SET(data->start_class, value);
-                   }
-                   break;
-               case NDIGIT:
-                   if (flags & SCF_DO_STCLASS_AND) {
-                       if (!(data->start_class->flags & ANYOF_LOCALE))
-                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
-                       for (value = 0; value < 256; value++)
-                           if (isDIGIT(value))
-                               ANYOF_BITMAP_CLEAR(data->start_class, value);
-                   }
-                   else {
-                       if (data->start_class->flags & ANYOF_LOCALE)
-                           ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
-                       for (value = 0; value < 256; value++)
-                           if (!isDIGIT(value))
-                               ANYOF_BITMAP_SET(data->start_class, value);
-                   }
-                   break;
-               CASE_SYNST_FNC(VERTWS);
-               CASE_SYNST_FNC(HORIZWS);
-
                }
                if (flags & SCF_DO_STCLASS_OR)
                    cl_and(data->start_class, and_withp);
@@ -6440,7 +6290,7 @@ reStudy:
             r->extflags |= RXf_NULL;
         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
             r->extflags |= RXf_START_ONLY;
-        else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
+        else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
                             && OP(regnext(first)) == END)
             r->extflags |= RXf_WHITE;    
     }
@@ -9553,6 +9403,16 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 ret = reg_node(pRExC_state, OPFAIL);
                 return ret;
             }
+            else if (max == 0) {
+                if (SIZE_ONLY) {
+                    RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
+                }
+                else {
+                    RExC_emit = orig_emit;
+                }
+                ret = reg_node(pRExC_state, NOTHING);
+                return ret;
+            }
 
        do_curly:
            if ((flags&SIMPLE)) {
@@ -10120,9 +9980,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     I32 flags;
     char *parse_start = RExC_parse;
     U8 op;
+    *flagp = WORST;            /* Tentatively. */
+
     GET_RE_DEBUG_FLAGS_DECL;
     DEBUG_PARSE("atom");
-    *flagp = WORST;            /* Tentatively. */
+    int invert = 0;
 
     PERL_ARGS_ASSERT_REGATOM;
 
@@ -10218,6 +10080,7 @@ tryagain:
           literal text handling code.
        */
        switch ((U8)*++RExC_parse) {
+            U8 arg;
        /* Special Escapes */
        case 'A':
            RExC_seen_zerolen++;
@@ -10258,22 +10121,14 @@ tryagain:
            ret = reg_node(pRExC_state, CLUMP);
            *flagp |= HASWIDTH;
            goto finish_meta_pat;
-       case 'w':
-           op = ALNUM + get_regex_charset(RExC_flags);
-            if (op > ALNUMA) {  /* /aa is same as /a */
-                op = ALNUMA;
-            }
-           ret = reg_node(pRExC_state, op);
-           *flagp |= HASWIDTH|SIMPLE;
-           goto finish_meta_pat;
+
        case 'W':
-           op = NALNUM + get_regex_charset(RExC_flags);
-            if (op > NALNUMA) { /* /aa is same as /a */
-                op = NALNUMA;
-            }
-           ret = reg_node(pRExC_state, op);
-           *flagp |= HASWIDTH|SIMPLE;
-           goto finish_meta_pat;
+            invert = 1;
+            /* FALLTHROUGH */
+       case 'w':
+            arg = ANYOF_WORDCHAR;
+            goto join_posix;
+
        case 'b':
            RExC_seen_zerolen++;
            RExC_seen |= REG_SEEN_LOOKBEHIND;
@@ -10296,60 +10151,60 @@ tryagain:
            FLAGS(ret) = get_regex_charset(RExC_flags);
            *flagp |= SIMPLE;
            goto finish_meta_pat;
+
+       case 'S':
+            invert = 1;
+            /* FALLTHROUGH */
        case 's':
-           op = SPACE + get_regex_charset(RExC_flags);
-            if (op > SPACEA) {  /* /aa is same as /a */
-                op = SPACEA;
+            arg = ANYOF_SPACE;
+
+        join_posix:
+
+           op = POSIXD + get_regex_charset(RExC_flags);
+            if (op > POSIXA) {  /* /aa is same as /a */
+                op = POSIXA;
             }
-           ret = reg_node(pRExC_state, op);
-           *flagp |= HASWIDTH|SIMPLE;
-           goto finish_meta_pat;
-       case 'S':
-           op = NSPACE + get_regex_charset(RExC_flags);
-            if (op > NSPACEA) { /* /aa is same as /a */
-                op = NSPACEA;
+
+        join_posix_op_known:
+
+            if (invert) {
+                op += NPOSIXD - POSIXD;
             }
            ret = reg_node(pRExC_state, op);
+            if (! SIZE_ONLY) {
+                FLAGS(ret) = namedclass_to_classnum(arg);
+            }
+
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'D':
-            op = NDIGIT;
-            goto join_D_and_d;
+            invert = 1;
+            /* FALLTHROUGH */
        case 'd':
-            op = DIGIT;
-        join_D_and_d:
-            {
-                U8 offset = get_regex_charset(RExC_flags);
-                if (offset == REGEX_UNICODE_CHARSET) {
-                    offset = REGEX_DEPENDS_CHARSET;
-                }
-                else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
-                    offset = REGEX_ASCII_RESTRICTED_CHARSET;
-                }
-                op += offset;
-            }
-           ret = reg_node(pRExC_state, op);
-           *flagp |= HASWIDTH|SIMPLE;
-           goto finish_meta_pat;
+            arg = ANYOF_DIGIT;
+            goto join_posix;
+
        case 'R':
            ret = reg_node(pRExC_state, LNBREAK);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
-       case 'h':
-           ret = reg_node(pRExC_state, HORIZWS);
-           *flagp |= HASWIDTH|SIMPLE;
-           goto finish_meta_pat;
+
        case 'H':
-           ret = reg_node(pRExC_state, NHORIZWS);
-           *flagp |= HASWIDTH|SIMPLE;
-           goto finish_meta_pat;
-       case 'v':
-           ret = reg_node(pRExC_state, VERTWS);
-           *flagp |= HASWIDTH|SIMPLE;
-           goto finish_meta_pat;
+            invert = 1;
+            /* FALLTHROUGH */
+       case 'h':
+           arg = ANYOF_BLANK;
+            op = POSIXU;
+            goto join_posix_op_known;
+
        case 'V':
-           ret = reg_node(pRExC_state, NVERTWS);
-           *flagp |= HASWIDTH|SIMPLE;
+            invert = 1;
+            /* FALLTHROUGH */
+       case 'v':
+           arg = ANYOF_VERTWS;
+            op = POSIXU;
+            goto join_posix_op_known;
+
          finish_meta_pat:          
            nextchar(pRExC_state);
             Set_Node_Length(ret, 2); /* MJD */
@@ -12314,101 +12169,69 @@ parseit:
         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
                                               [:digit:] or \p{foo} */
 
-            /* Certain named classes have equivalents that can appear outside a
-             * character class, e.g. \w, \H.  We use these instead of a
-             * character class. */
+            /* All named classes are mapped into POSIXish nodes, with its FLAG
+             * argument giving which class it is */
             switch ((I32)namedclass) {
-                U8 offset;
-
-                /* The first group is for node types that depend on the charset
-                 * modifier to the regex.  We first calculate the base node
-                 * type, and if it should be inverted */
-
-                case ANYOF_NWORDCHAR:
-                    invert = ! invert;
-                    /* FALLTHROUGH */
-                case ANYOF_WORDCHAR:
-                    op = ALNUM;
-                    goto join_charset_classes;
-
-                case ANYOF_NSPACE:
-                    invert = ! invert;
-                    /* FALLTHROUGH */
-                case ANYOF_SPACE:
-                    op = SPACE;
-                    goto join_charset_classes;
-
-                case ANYOF_NDIGIT:
-                    invert = ! invert;
-                    /* FALLTHROUGH */
-                case ANYOF_DIGIT:
-                    op = DIGIT;
-
-                  join_charset_classes:
-
-                    /* Now that we have the base node type, we take advantage
-                     * of the enum ordering of the charset modifiers to get the
-                     * exact node type,  For example the base SPACE also has
-                     * SPACEL, SPACEU, and SPACEA */
-
-                    offset = get_regex_charset(RExC_flags);
-
-                    /* /aa is the same as /a for these */
-                    if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
-                        offset = REGEX_ASCII_RESTRICTED_CHARSET;
-                    }
-                    else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
-                        offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
-                    }
-
-                    op += offset;
-
-                    /* The number of varieties of each of these is the same,
-                     * hence, so is the delta between the normal and
-                     * complemented nodes */
-                    if (invert) {
-                        op += NALNUM - ALNUM;
-                    }
-                    *flagp |= HASWIDTH|SIMPLE;
+                case ANYOF_UNIPROP:
                     break;
 
-                /* The second group doesn't depend of the charset modifiers.
-                 * We just have normal and complemented */
+                /* These don't depend on the charset modifiers.  They always
+                 * match under /u rules */
                 case ANYOF_NHORIZWS:
-                    invert = ! invert;
-                    /* FALLTHROUGH */
                 case ANYOF_HORIZWS:
-                  is_horizws:
-                    op = (invert) ? NHORIZWS : HORIZWS;
-                    *flagp |= HASWIDTH|SIMPLE;
-                    break;
+                    namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
+                    /* FALLTHROUGH */
 
                 case ANYOF_NVERTWS:
-                    invert = ! invert;
-                    /* FALLTHROUGH */
                 case ANYOF_VERTWS:
-                    op = (invert) ? NVERTWS : VERTWS;
-                    *flagp |= HASWIDTH|SIMPLE;
-                    break;
-
-                case ANYOF_UNIPROP:
-                    break;
-
-                case ANYOF_NBLANK:
-                    invert = ! invert;
-                    /* FALLTHROUGH */
-                case ANYOF_BLANK:
-                    if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
-                        goto is_horizws;
+                    op = POSIXU;
+                    goto join_posix;
+
+                /* The actual POSIXish node for all the rest depends on the
+                 * charset modifier.  The ones in the first set depend only on
+                 * ASCII or, if available on this platform, locale */
+                case ANYOF_ASCII:
+                case ANYOF_NASCII:
+#ifdef HAS_ISASCII
+                    op = (LOC) ? POSIXL : POSIXA;
+#else
+                    op = POSIXA;
+#endif
+                    goto join_posix;
+
+                case ANYOF_LOWER:
+                case ANYOF_NLOWER:
+                case ANYOF_UPPER:
+                case ANYOF_NUPPER:
+                    /* under /a could be alpha */
+                    if (FOLD) {
+                        if (ASCII_RESTRICTED) {
+                            namedclass = ANYOF_ALPHA + (namedclass % 2);
+                        }
+                        else if (! LOC) {
+                            break;
+                        }
                     }
                     /* FALLTHROUGH */
+
+                /* The rest have more possibilities depending on the charset.  We
+                 * take advantage of the enum ordering of the charset modifiers to
+                 * get the exact node type, */
                 default:
-                    /* A generic posix class.  All the /a ones can be handled
-                     * by the POSIXA opcode.  And all are closed under folding
-                     * in the ASCII range, so FOLD doesn't matter */
-                    if (AT_LEAST_ASCII_RESTRICTED
-                        || (! LOC && namedclass == ANYOF_ASCII))
+                    op = POSIXD + get_regex_charset(RExC_flags);
+                    if (op > POSIXA) { /* /aa is same as /a */
+                        op = POSIXA;
+                    }
+#ifndef HAS_ISBLANK
+                    if (op == POSIXL
+                        && (namedclass == ANYOF_BLANK
+                            || namedclass == ANYOF_NBLANK))
                     {
+                        op = POSIXA;
+                    }
+#endif
+
+                join_posix:
                         /* The odd numbered ones are the complements of the
                          * next-lower even number one */
                         if (namedclass % 2 == 1) {
@@ -12416,8 +12239,6 @@ parseit:
                             namedclass--;
                         }
                         arg = namedclass_to_classnum(namedclass);
-                        op = (invert) ? NPOSIXA : POSIXA;
-                    }
                     break;
             }
         }
@@ -12442,8 +12263,8 @@ parseit:
         else if (! LOC) {   /* locale could vary these */
             if (prevvalue == '0') {
                 if (value == '9') {
-                    op = (invert) ? NDIGITA : DIGITA;
-                    *flagp |= HASWIDTH|SIMPLE;
+                    arg = _CC_DIGIT;
+                    op = POSIXA;
                 }
             }
         }
@@ -12469,6 +12290,11 @@ parseit:
             }
             else {
                 RExC_emit = (regnode *)orig_emit;
+                if (PL_regkind[op] == POSIXD) {
+                    if (invert) {
+                        op += NPOSIXD - POSIXD;
+                    }
+                }
             }
 
             ret = reg_node(pRExC_state, op);
index eb8ba46..2a49d20 100644 (file)
@@ -36,8 +36,7 @@ SEOL        EOL,        no        ; Same, assuming singleline.
 # modifiers have to ordered thusly: /d, /l, /u, /a, /aa.  This is because code
 # in regcomp.c uses the enum value of the modifier as an offset from the /d
 # version.  The complements must come after the non-complements.
-# BOUND, ALNUM, SPACE, DIGIT, and their complements are affected, as well as
-# EXACTF.
+# BOUND, POSIX and their complements are affected, as well as EXACTF.
 BOUND       BOUND,      no        ; Match "" at any word boundary using native charset semantics for non-utf8
 BOUNDL      BOUND,      no        ; Match "" at any locale word boundary
 BOUNDU      BOUND,      no        ; Match "" at any word boundary using Unicode semantics
@@ -56,44 +55,16 @@ SANY        REG_ANY,    no 0 S    ; Match any one character.
 CANY        REG_ANY,    no 0 S    ; Match any one byte.
 ANYOF       ANYOF,      sv 0 S    ; Match character in (or not in) this class, single char match only
 
-# Order (within each group) of the below is important.  See ordering comment
-# above.  The PLACEHOLDERn ones are wasting a value.  Right now, we have plenty
-# to spare, but these would be obvious candidates if ever we ran out of node
-# types in a U8.
-ALNUM       ALNUM,      no 0 S    ; Match any alphanumeric character using native charset semantics for non-utf8
-ALNUML      ALNUM,      no 0 S    ; Match any alphanumeric char in locale
-ALNUMU      ALNUM,      no 0 S    ; Match any alphanumeric char using Unicode semantics
-ALNUMA      ALNUM,      no 0 S    ; Match [A-Za-z_0-9]
-NALNUM      NALNUM,     no 0 S    ; Match any non-alphanumeric character using native charset semantics for non-utf8
-NALNUML     NALNUM,     no 0 S    ; Match any non-alphanumeric char in locale
-NALNUMU     NALNUM,     no 0 S    ; Match any non-alphanumeric char using Unicode semantics
-NALNUMA     NALNUM,     no 0 S    ; Match [^A-Za-z_0-9]
-SPACE       SPACE,      no 0 S    ; Match any whitespace character using native charset semantics for non-utf8
-SPACEL      SPACE,      no 0 S    ; Match any whitespace char in locale
-SPACEU      SPACE,      no 0 S    ; Match any whitespace char using Unicode semantics
-SPACEA      SPACE,      no 0 S    ; Match [ \t\n\f\r]
-NSPACE      NSPACE,     no 0 S    ; Match any non-whitespace character using native charset semantics for non-utf8
-NSPACEL     NSPACE,     no 0 S    ; Match any non-whitespace char in locale
-NSPACEU     NSPACE,     no 0 S    ; Match any non-whitespace char using Unicode semantics
-NSPACEA     NSPACE,     no 0 S    ; Match [^ \t\n\f\r]
-DIGIT       DIGIT,      no 0 S    ; Match any numeric character using native charset semantics for non-utf8
-DIGITL      DIGIT,      no 0 S    ; Match any numeric character in locale
-PLACEHOLDER1 NOTHING,   no        ; placeholder for missing DIGITU
-DIGITA      DIGIT,      no 0 S    ; Match [0-9]
-NDIGIT      NDIGIT,     no 0 S    ; Match any non-numeric character using native charset semantics for non-utf8
-NDIGITL     NDIGIT,     no 0 S    ; Match any non-numeric character in locale
-PLACEHOLDER2 NOTHING,   no        ; placeholder for missing NDIGITU
-NDIGITA     NDIGIT,     no 0 S    ; Match [^0-9]
-
-POSIXD      POSIXD,     none 0 S   ; currently unused except as a placeholder
-POSIXL      POSIXD,     none 0 S   ; currently unused except as a placeholder
-POSIXU      POSIXD,     none 0 S   ; currently unused except as a placeholder
+# Order of the below is important.  See ordering comment above.
+POSIXD      POSIXD,     none 0 S   ; Some [[:class:]] under /d; the FLAGS field gives which one
+POSIXL      POSIXD,     none 0 S   ; Some [[:class:]] under /l; the FLAGS field gives which one
+POSIXU      POSIXD,     none 0 S   ; Some [[:class:]] under /u; the FLAGS field gives which one
 POSIXA      POSIXD,     none 0 S   ; Some [[:class:]] under /a; the FLAGS field gives which one
-NPOSIXD     NPOSIXD,    none 0 S   ; currently unused except as a placeholder
-NPOSIXL     NPOSIXD,    none 0 S   ; currently unused except as a placeholder
-NPOSIXU     NPOSIXD,    none 0 S   ; currently unused except as a placeholder
+NPOSIXD     NPOSIXD,    none 0 S   ; complement of POSIXD, [[:^class:]]
+NPOSIXL     NPOSIXD,    none 0 S   ; complement of POSIXL, [[:^class:]]
+NPOSIXU     NPOSIXD,    none 0 S   ; complement of POSIXU, [[:^class:]]
 NPOSIXA     NPOSIXD,    none 0 S   ; complement of POSIXA, [[:^class:]]
-# End of order is important (within groups)
+# End of order is important
 
 CLUMP       CLUMP,      no 0 V    ; Match any extended grapheme cluster sequence
 
@@ -237,13 +208,6 @@ KEEPS       KEEPS,      no        ; $& begins here.
 #*New charclass like patterns
 LNBREAK     LNBREAK,    none      ; generic newline pattern
 
-# regcomp.c expects the node number of the complement to be one greater than
-# the non-complement
-VERTWS      VERTWS,     none 0 S  ; vertical whitespace         (Perl 6)
-NVERTWS     NVERTWS,    none 0 S  ; not vertical whitespace     (Perl 6)
-HORIZWS     HORIZWS,    none 0 S  ; horizontal whitespace       (Perl 6)
-NHORIZWS    NHORIZWS,   none 0 S  ; not horizontal whitespace   (Perl 6)
-
 # NEW STUFF SOMEWHERE ABOVE THIS LINE
 
 ################################################################################
index 7d03f09..31a25fb 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -174,101 +174,6 @@ static const char* const non_utf8_target_but_utf8_required
 
 #define PLACEHOLDER    /* Something for the preprocessor to grab onto */
 
-/* The actual code for CCC_TRY, which uses several variables from the routine
- * it's callable from.  It is designed to be the bulk of a case statement.
- * FUNC is the macro or function to call on non-utf8 targets that indicate if
- *      nextchr matches the class.
- * UTF8_TEST is the whole test string to use for utf8 targets
- * LOAD is what to use to test, and if not present to load in the swash for the
- *     class
- * POS_OR_NEG is either empty or ! to complement the results of FUNC or
- *     UTF8_TEST test.
- * The logic is: Fail if we're at the end-of-string; otherwise if the target is
- * utf8 and a variant, load the swash if necessary and test using the utf8
- * test.  Advance to the next character if test is ok, otherwise fail; If not
- * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
- * fails, or advance to the next character */
-
-#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
-    if (NEXTCHR_IS_EOS) {                                                     \
-       sayNO;                                                                \
-    }                                                                         \
-    if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
-       LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
-       if (POS_OR_NEG (UTF8_TEST)) {                                         \
-           sayNO;                                                            \
-       }                                                                     \
-    }                                                                         \
-    else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
-            sayNO;                                                            \
-    }                                                                         \
-    goto increment_locinput;
-
-/* Handle the non-locale cases for a character class and its complement.  It
- * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
- * This is because that code fails when the test succeeds, so we want to have
- * the test fail so that the code succeeds.  The swash is stored in a
- * predictable PL_ place */
-#define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
-                          CLASS, STR)                                        \
-    case NAME:                                                                \
-       _CCC_TRY_CODE( !, FUNC,                                               \
-                         cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
-                                           (U8*)locinput, TRUE)),            \
-                         CLASS, STR)                                         \
-    case NNAME:                                                               \
-       _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
-                         cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
-                                           (U8*)locinput, TRUE)),            \
-                         CLASS, STR)
-/* Generate the case statements for both locale and non-locale character
- * classes in regmatch for classes that don't have special unicode semantics.
- * Locales don't use an immediate swash, but an intermediary special locale
- * function that is called on the pointer to the current place in the input
- * string.  That function will resolve to needing the same swash.  One might
- * think that because we don't know what the locale will match, we shouldn't
- * check with the swash loading function that it loaded properly; ie, that we
- * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
- * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
- * irrelevant here */
-#define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
-               NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
-               NAMEA, NNAMEA, FUNCA,                                         \
-               CLASS, STR)                                                   \
-    case NAMEL:                                                               \
-       PL_reg_flags |= RF_tainted;                                           \
-       _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
-    case NNAMEL:                                                              \
-       PL_reg_flags |= RF_tainted;                                           \
-       _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
-                      CLASS, STR)                                            \
-    case NAMEA:                                                               \
-       if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
-           sayNO;                                                            \
-       }                                                                     \
-       /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
-       locinput++;                                        \
-       break;                                                                \
-    case NNAMEA:                                                              \
-       if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
-           sayNO;                                                            \
-       }                                                                     \
-        goto increment_locinput;                                              \
-    /* Generate the non-locale cases */                                       \
-    _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
-
-/* This is like CCC_TRY, but has an extra set of parameters for generating case
- * statements to handle separate Unicode semantics nodes */
-#define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
-                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
-                 NAMEU, NNAMEU, FUNCU,                                        \
-                 NAMEA, NNAMEA, FUNCA,                                        \
-                 CLASS, STR)                                                  \
-    CCC_TRY(NAME, NNAME, FUNC,                                                 \
-           NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
-           NAMEA, NNAMEA, FUNCA,                                              \
-           CLASS, STR)                                                        \
-    _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 
 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 
@@ -549,6 +454,56 @@ S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
     return FALSE;
 }
 
+STATIC bool
+S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
+{
+    /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
+     * 'character' is a member of the Posix character class given by 'classnum'
+     * that should be equivalent to a value in the typedef
+     * '_char_class_number'.
+     *
+     * This just calls isFOO_lc on the code point for the character if it is in
+     * the range 0-255.  Outside that range, all characters use Unicode
+     * rules, ignoring any locale.  So use the Unicode function if this class
+     * requires a swash, and use the Unicode macro otherwise. */
+
+    PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
+
+    if (UTF8_IS_INVARIANT(*character)) {
+        return isFOO_lc(classnum, *character);
+    }
+    else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
+        return isFOO_lc(classnum,
+                        TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
+    }
+
+    if (classnum < _FIRST_NON_SWASH_CC) {
+
+        /* Initialize the swash unless done already */
+        if (! PL_utf8_swash_ptrs[classnum]) {
+            U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+            PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
+                swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
+        }
+
+        return swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) character, TRUE);
+    }
+
+    switch ((_char_class_number) classnum) {
+        case _CC_ENUM_SPACE:
+        case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
+
+        case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
+        case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
+        case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
+        default:                 return 0;  /* Things like CNTRL are always
+                                               below 256 */
+    }
+
+    assert(0); /* NOTREACHED */
+    return FALSE;
+}
+
 /*
  * pregexec and friends
  */
@@ -1498,13 +1453,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     const U8 *fold_array;   /* array for folding ords < 256 */
     STRLEN ln;
     STRLEN lnc;
-    STRLEN uskip;
     U8 c1;
     U8 c2;
     char *e;
     I32 tmp = 1;       /* Scratch variable? */
     const bool utf8_target = PL_reg_match_utf8;
     UV utf8_fold_flags = 0;
+    bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
+                                   with a result inverts that result, as 0^1 =
+                                   1 and 1^1 = 0 */
+    _char_class_number classnum;
+
     RXi_GET_DECL(prog,progi);
 
     PERL_ARGS_ASSERT_FIND_BYCLASS;
@@ -1710,182 +1669,155 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                    isALNUM_uni(tmp),
                    cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
         break;
-    case ALNUML:
-        REXEC_FBC_CSCAN_TAINT(
-            isALNUM_LC_utf8((U8*)s),
-            isALNUM_LC(*s)
-        );
-        break;
-    case ALNUMU:
-        REXEC_FBC_CSCAN_PRELOAD(
-            LOAD_UTF8_CHARCLASS_ALNUM(),
-            swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
-            isWORDCHAR_L1((U8) *s)
-        );
-        break;
-    case ALNUM:
-        REXEC_FBC_CSCAN_PRELOAD(
-            LOAD_UTF8_CHARCLASS_ALNUM(),
-            swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
-            isWORDCHAR((U8) *s)
-        );
-        break;
-    case ALNUMA:
-        /* Don't need to worry about utf8, as it can match only a single
-         * byte invariant character */
-        REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
-        break;
-    case NALNUMU:
-        REXEC_FBC_CSCAN_PRELOAD(
-            LOAD_UTF8_CHARCLASS_ALNUM(),
-            !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
-            ! isWORDCHAR_L1((U8) *s)
-        );
-        break;
-    case NALNUM:
-        REXEC_FBC_CSCAN_PRELOAD(
-            LOAD_UTF8_CHARCLASS_ALNUM(),
-            !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
-            ! isALNUM(*s)
-        );
-        break;
-    case NALNUMA:
-        REXEC_FBC_CSCAN(
-            !isWORDCHAR_A(*s),
-            !isWORDCHAR_A(*s)
-        );
-        break;
-    case NALNUML:
-        REXEC_FBC_CSCAN_TAINT(
-            !isALNUM_LC_utf8((U8*)s),
-            !isALNUM_LC(*s)
-        );
-        break;
-    case SPACEU:
-        REXEC_FBC_CSCAN(
-            is_XPERLSPACE_utf8(s),
-            isSPACE_L1((U8) *s)
-        );
-        break;
-    case SPACE:
-        REXEC_FBC_CSCAN(
-            is_XPERLSPACE_utf8(s),
-            isSPACE((U8) *s)
-        );
-        break;
-    case SPACEA:
-        /* Don't need to worry about utf8, as it can match only a single
-         * byte invariant character */
-        REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
-        break;
-    case SPACEL:
-        REXEC_FBC_CSCAN_TAINT(
-            isSPACE_LC_utf8((U8*)s),
-            isSPACE_LC(*s)
-        );
-        break;
-    case NSPACEU:
-        REXEC_FBC_CSCAN(
-            ! is_XPERLSPACE_utf8(s),
-            ! isSPACE_L1((U8) *s)
-        );
-        break;
-    case NSPACE:
-        REXEC_FBC_CSCAN(
-            ! is_XPERLSPACE_utf8(s),
-            ! isSPACE((U8) *s)
-        );
-        break;
-    case NSPACEA:
-        REXEC_FBC_CSCAN(
-            !isSPACE_A(*s),
-            !isSPACE_A(*s)
-        );
-        break;
-    case NSPACEL:
-        REXEC_FBC_CSCAN_TAINT(
-            !isSPACE_LC_utf8((U8*)s),
-            !isSPACE_LC(*s)
-        );
-        break;
-    case DIGIT:
-        REXEC_FBC_CSCAN_PRELOAD(
-            LOAD_UTF8_CHARCLASS_DIGIT(),
-            swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
-            isDIGIT(*s)
-        );
-        break;
-    case DIGITA:
-        /* Don't need to worry about utf8, as it can match only a single
-         * byte invariant character */
-        REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
-        break;
-    case DIGITL:
-        REXEC_FBC_CSCAN_TAINT(
-            isDIGIT_LC_utf8((U8*)s),
-            isDIGIT_LC(*s)
-        );
-        break;
-    case NDIGIT:
-        REXEC_FBC_CSCAN_PRELOAD(
-            LOAD_UTF8_CHARCLASS_DIGIT(),
-            !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
-            !isDIGIT(*s)
-        );
-        break;
-    case NDIGITA:
-        REXEC_FBC_CSCAN(
-            !isDIGIT_A(*s),
-            !isDIGIT_A(*s)
-        );
-        break;
-    case NDIGITL:
-        REXEC_FBC_CSCAN_TAINT(
-            !isDIGIT_LC_utf8((U8*)s),
-            !isDIGIT_LC(*s)
-        );
-        break;
     case LNBREAK:
         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
                         is_LNBREAK_latin1_safe(s, strend)
         );
         break;
-    case VERTWS:
-        REXEC_FBC_CSCAN(
-            is_VERTWS_utf8_safe(s, strend),
-            is_VERTWS_latin1_safe(s, strend)
-        );
-        break;
-    case NVERTWS:
-        REXEC_FBC_CSCAN(
-            !is_VERTWS_utf8_safe(s, strend),
-            !is_VERTWS_latin1_safe(s, strend)
-        );
-        break;
-    case HORIZWS:
-        REXEC_FBC_CSCAN(
-            is_HORIZWS_utf8_safe(s, strend),
-            is_HORIZWS_latin1_safe(s, strend)
-        );
-        break;
-    case NHORIZWS:
-        REXEC_FBC_CSCAN(
-            !is_HORIZWS_utf8_safe(s, strend),
-            !is_HORIZWS_latin1_safe(s, strend)
-        );
+
+    /* The argument to all the POSIX node types is the class number to pass to
+     * _generic_isCC() to build a mask for searching in PL_charclass[] */
+
+    case NPOSIXL:
+        to_complement = 1;
+        /* FALLTHROUGH */
+
+    case POSIXL:
+        PL_reg_flags |= RF_tainted;
+        REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
+                        to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
         break;
+
+    case NPOSIXD:
+        to_complement = 1;
+        /* FALLTHROUGH */
+
+    case POSIXD:
+        if (utf8_target) {
+            goto posix_utf8;
+        }
+        goto posixa;
+
+    case NPOSIXA:
+        if (utf8_target) {
+            /* The complement of something that matches only ASCII matches all
+             * UTF-8 variant code points, plus everything in ASCII that isn't
+             * in the class */
+            REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
+                                      || ! _generic_isCC_A(*s, FLAGS(c)));
+            break;
+        }
+
+        to_complement = 1;
+        /* FALLTHROUGH */
+
     case POSIXA:
+      posixa:
         /* Don't need to worry about utf8, as it can match only a single
-        * byte invariant character.  The flag in this node type is the
-        * class number to pass to _generic_isCC() to build a mask for
-        * searching in PL_charclass[] */
-        REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
+         * byte invariant character. */
+        REXEC_FBC_CLASS_SCAN(
+                        to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
         break;
-    case NPOSIXA:
-        REXEC_FBC_CSCAN(
-            !_generic_isCC_A(*s, FLAGS(c)),
-            !_generic_isCC_A(*s, FLAGS(c))
-        );
+
+    case NPOSIXU:
+        to_complement = 1;
+        /* FALLTHROUGH */
+
+    case POSIXU:
+        if (! utf8_target) {
+            REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
+                                                                    FLAGS(c))));
+        }
+        else {
+
+      posix_utf8:
+            classnum = (_char_class_number) FLAGS(c);
+            if (classnum < _FIRST_NON_SWASH_CC) {
+                while (s < strend) {
+
+                    /* We avoid loading in the swash as long as possible, but
+                     * should we have to, we jump to a separate loop.  This
+                     * extra 'if' statement is what keeps this code from being
+                     * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
+                    if (UTF8_IS_ABOVE_LATIN1(*s)) {
+                        goto found_above_latin1;
+                    }
+                    if ((UTF8_IS_INVARIANT(*s)
+                         && to_complement ^ cBOOL(_generic_isCC((U8) *s,
+                                                                classnum)))
+                        || (UTF8_IS_DOWNGRADEABLE_START(*s)
+                            && to_complement ^ cBOOL(
+                                _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
+                                              classnum))))
+                    {
+                        if (tmp && (!reginfo || regtry(reginfo, &s)))
+                            goto got_it;
+                        else {
+                            tmp = doevery;
+                        }
+                    }
+                    else {
+                        tmp = 1;
+                    }
+                    s += UTF8SKIP(s);
+                }
+            }
+            else switch (classnum) {    /* These classes are implemented as
+                                           macros */
+                case _CC_ENUM_SPACE: /* XXX would require separate code if we
+                                        revert the change of \v matching this */
+                    /* FALL THROUGH */
+
+                case _CC_ENUM_PSXSPC:
+                    REXEC_FBC_UTF8_CLASS_SCAN(
+                                        to_complement ^ cBOOL(isSPACE_utf8(s)));
+                    break;
+
+                case _CC_ENUM_BLANK:
+                    REXEC_FBC_UTF8_CLASS_SCAN(
+                                        to_complement ^ cBOOL(isBLANK_utf8(s)));
+                    break;
+
+                case _CC_ENUM_XDIGIT:
+                    REXEC_FBC_UTF8_CLASS_SCAN(
+                                       to_complement ^ cBOOL(isXDIGIT_utf8(s)));
+                    break;
+
+                case _CC_ENUM_VERTSPACE:
+                    REXEC_FBC_UTF8_CLASS_SCAN(
+                                       to_complement ^ cBOOL(isVERTWS_utf8(s)));
+                    break;
+
+                case _CC_ENUM_CNTRL:
+                    REXEC_FBC_UTF8_CLASS_SCAN(
+                                        to_complement ^ cBOOL(isCNTRL_utf8(s)));
+                    break;
+
+                default:
+                    Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
+                    assert(0); /* NOTREACHED */
+            }
+        }
+        break;
+
+      found_above_latin1:   /* Here we have to load a swash to get the result
+                               for the current code point */
+        if (! PL_utf8_swash_ptrs[classnum]) {
+            U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+            PL_utf8_swash_ptrs[classnum] =
+                    _core_swash_init("utf8", swash_property_names[classnum],
+                                     &PL_sv_undef, 1, 0, NULL, &flags);
+        }
+
+        /* This is a copy of the loop above for swash classes, though using the
+         * FBC macro instead of being expanded out.  Since we've loaded the
+         * swash, we don't have to check for that each time through the loop */
+        REXEC_FBC_UTF8_CLASS_SCAN(
+                to_complement ^ cBOOL(_generic_utf8(
+                                      classnum,
+                                      s,
+                                      swash_fetch(PL_utf8_swash_ptrs[classnum],
+                                                  (U8 *) s, TRUE))));
         break;
 
     case AHOCORASICKC:
@@ -3636,6 +3568,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
     CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
     CHECKPOINT runops_cp;      /* savestack position before executing EVAL */
     U32 maxopenparen = 0;       /* max '(' index seen so far */
+    int to_complement;  /* Invert the result? */
+    _char_class_number classnum;
 
 #ifdef DEBUGGING
     GET_RE_DEBUG_FLAGS_DECL;
@@ -3697,6 +3631,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
        state_num = OP(scan);
 
       reenter_switch:
+        to_complement = 0;
 
         SET_nextchr;
         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
@@ -4362,100 +4297,184 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            }
            break;
 
-       /* Special char classes: \d, \w etc.
-         * The defines start on line 166 or so */
-        CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
-                 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
-                 ALNUMU, NALNUMU, isWORDCHAR_L1,
-                 ALNUMA, NALNUMA, isWORDCHAR_A,
-                 alnum, "a");
+        /* The argument (FLAGS) to all the POSIX node types is the class number
+         * */
 
-        case SPACEL:
-            PL_reg_flags |= RF_tainted;
-            if (NEXTCHR_IS_EOS) {
+        case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
+            to_complement = 1;
+            /* FALLTHROUGH */
+
+        case POSIXL:    /* \w or [:punct:] etc. under /l */
+            if (NEXTCHR_IS_EOS)
                 sayNO;
-            }
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
-                if (! isSPACE_LC_utf8((U8 *) locinput)) {
-                    sayNO;
-                }
-            }
-            else if (! isSPACE_LC((U8) nextchr)) {
-                    sayNO;
-            }
-            goto increment_locinput;
 
-        case NSPACEL:
+            /* The locale hasn't influenced the outcome before this, so defer
+             * tainting until now */
             PL_reg_flags |= RF_tainted;
-            if (NEXTCHR_IS_EOS) {
-                sayNO;
-            }
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
-                if (isSPACE_LC_utf8((U8 *) locinput)) {
+
+            /* Use isFOO_lc() for characters within Latin1.  (Note that
+             * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
+             * wouldn't be invariant) */
+            if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
+                if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), nextchr)))) {
                     sayNO;
                 }
             }
-            else if (isSPACE_LC(nextchr)) {
+            else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
+                if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
+                                        TWO_BYTE_UTF8_TO_UNI(nextchr,
+                                                            *(locinput + 1))))))
+                {
                     sayNO;
+                }
             }
-            goto increment_locinput;
-
-        case SPACE:
-            if (utf8_target) {
-                goto utf8_space;
+            else { /* Here, must be an above Latin-1 code point */
+                goto utf8_posix_not_eos;
             }
-            /* FALL THROUGH */
-        case SPACEA:
-            if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
-                sayNO;
-            }
-            /* Matched a utf8-invariant, so don't have to worry about utf8 */
-            locinput++;
+
+            /* Here, must be utf8 */
+            locinput += UTF8SKIP(locinput);
             break;
 
-        case NSPACE:
+        case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
+            to_complement = 1;
+            /* FALLTHROUGH */
+
+        case POSIXD:    /* \w or [:punct:] etc. under /d */
             if (utf8_target) {
-                goto utf8_nspace;
-            }
-            /* FALL THROUGH */
-        case NSPACEA:
-            if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
-                sayNO;
+                goto utf8_posix;
             }
-            goto increment_locinput;
+            goto posixa;
+
+        case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
 
-        case SPACEU:
-          utf8_space:
-            if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
+            if (NEXTCHR_IS_EOS) {
                 sayNO;
             }
-            goto increment_locinput;
 
-        case NSPACEU:
-          utf8_nspace:
-            if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
-                sayNO;
+            /* All UTF-8 variants match */
+            if (! UTF8_IS_INVARIANT(nextchr)) {
+                goto increment_locinput;
             }
-            goto increment_locinput;
 
-        CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
-               DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
-               DIGITA, NDIGITA, isDIGIT_A,
-               digit, "0");
+            to_complement = 1;
+            /* FALLTHROUGH */
+
+        case POSIXA:    /* \w or [:punct:] etc. under /a */
+
+          posixa:
+            /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
+             * UTF-8, and also from NPOSIXA even in UTF-8 when the current
+             * character is a single byte */
 
-        case POSIXA: /* /[[:ascii:]]/ etc */
-            if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+            if (NEXTCHR_IS_EOS
+                || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
+                                                            FLAGS(scan)))))
+            {
                 sayNO;
             }
-            /* Matched a utf8-invariant, so don't have to worry about utf8 */
+
+            /* Here we are either not in utf8, or we matched a utf8-invariant,
+             * so the next char is the next byte */
             locinput++;
             break;
 
-        case NPOSIXA: /*  /[^[:ascii:]]/  etc */
-            if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
+        case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
+            to_complement = 1;
+            /* FALLTHROUGH */
+
+        case POSIXU:    /* \w or [:punct:] etc. under /u */
+          utf8_posix:
+            if (NEXTCHR_IS_EOS) {
                 sayNO;
             }
-            goto increment_locinput;
+          utf8_posix_not_eos:
+
+            /* Use _generic_isCC() for characters within Latin1.  (Note that
+             * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
+             * wouldn't be invariant) */
+            if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
+                if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
+                                                           FLAGS(scan)))))
+                {
+                    sayNO;
+                }
+                locinput++;
+            }
+            else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
+                if (! (to_complement
+                       ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
+                                                               *(locinput + 1)),
+                                              FLAGS(scan)))))
+                {
+                    sayNO;
+                }
+                locinput += 2;
+            }
+            else {  /* Handle above Latin-1 code points */
+                classnum = (_char_class_number) FLAGS(scan);
+                if (classnum < _FIRST_NON_SWASH_CC) {
+
+                    /* Here, uses a swash to find such code points.  Load it if
+                     * not done already */
+                    if (! PL_utf8_swash_ptrs[classnum]) {
+                        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+                        PL_utf8_swash_ptrs[classnum]
+                                = _core_swash_init("utf8",
+                                        swash_property_names[classnum],
+                                        &PL_sv_undef, 1, 0, NULL, &flags);
+                    }
+                    if (! (to_complement
+                           ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
+                                               (U8 *) locinput, TRUE))))
+                    {
+                        sayNO;
+                    }
+                }
+                else {  /* Here, uses macros to find above Latin-1 code points */
+                    switch (classnum) {
+                        case _CC_ENUM_SPACE:    /* XXX would require separate
+                                                   code if we revert the change
+                                                   of \v matching this */
+                        case _CC_ENUM_PSXSPC:
+                            if (! (to_complement
+                                        ^ cBOOL(is_XPERLSPACE_high(locinput))))
+                            {
+                                sayNO;
+                            }
+                            break;
+                        case _CC_ENUM_BLANK:
+                            if (! (to_complement
+                                            ^ cBOOL(is_HORIZWS_high(locinput))))
+                            {
+                                sayNO;
+                            }
+                            break;
+                        case _CC_ENUM_XDIGIT:
+                            if (! (to_complement
+                                            ^ cBOOL(is_XDIGIT_high(locinput))))
+                            {
+                                sayNO;
+                            }
+                            break;
+                        case _CC_ENUM_VERTSPACE:
+                            if (! (to_complement
+                                            ^ cBOOL(is_VERTWS_high(locinput))))
+                            {
+                                sayNO;
+                            }
+                            break;
+                        default:    /* The rest, e.g. [:cntrl:], can't match
+                                       above Latin1 */
+                            if (! to_complement) {
+                                sayNO;
+                            }
+                            break;
+                    }
+                }
+                locinput += UTF8SKIP(locinput);
+            }
+            break;
 
        case CLUMP: /* Match \X: logical Unicode character.  This is defined as
                       a Unicode extended Grapheme Cluster */
@@ -6417,29 +6436,6 @@ NULL
                 sayNO;
             break;
 
-#define CASE_CLASS(nAmE)                              \
-        case nAmE:                                    \
-           if (NEXTCHR_IS_EOS)                       \
-               sayNO;                                \
-            if ((n=is_##nAmE(locinput,utf8_target))) {    \
-                locinput += n;                        \
-            } else                                    \
-                sayNO;                                \
-            break;                                    \
-        case N##nAmE:                                 \
-           if (NEXTCHR_IS_EOS)                       \
-               sayNO;                                \
-            if ((n=is_##nAmE(locinput,utf8_target))) {    \
-                sayNO;                                \
-            } else {                                  \
-                locinput += UTF8SKIP(locinput);       \
-            }                                         \
-            break
-
-        CASE_CLASS(VERTWS);  /*  \v \V  */
-        CASE_CLASS(HORIZWS); /*  \h \H  */
-#undef CASE_CLASS
-
        default:
            PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
                          PTR2UV(scan), OP(scan));
@@ -6665,7 +6661,9 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     char *loceol = PL_regeol;   /* local version */
     I32 hardcount = 0;  /* How many matches so far */
     bool utf8_target = PL_reg_match_utf8;
+    int to_complement = 0;  /* Invert the result? */
     UV utf8_flags;
+    _char_class_number classnum;
 #ifndef DEBUGGING
     PERL_UNUSED_ARG(depth);
 #endif
@@ -6887,79 +6885,38 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
                scan++;
        }
        break;
-    case ALNUMU:
-       if (utf8_target) {
-    utf8_wordchar:
-           LOAD_UTF8_CHARCLASS_ALNUM();
-           while (hardcount < max && scan < loceol &&
-                   swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
-            {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-        } else {
-            while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
-                scan++;
-            }
-       }
-       break;
-    case ALNUM:
-       if (utf8_target)
-           goto utf8_wordchar;
-       while (scan < loceol && isALNUM((U8) *scan)) {
-           scan++;
-       }
-       break;
-    case ALNUMA:
-        if (utf8_target && scan + max < loceol) {
 
-            /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
-             * since here, to match, 1 char == 1 byte */
-            loceol = scan + max;
-        }
-       while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
-           scan++;
-       }
-       break;
-    case ALNUML:
-       PL_reg_flags |= RF_tainted;
-       if (utf8_target) {
-           while (hardcount < max && scan < loceol &&
-                  isALNUM_LC_utf8((U8*)scan)) {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && isALNUM_LC(*scan))
-               scan++;
-       }
-       break;
-    case NALNUMU:
-       if (utf8_target) {
+    /* The argument (FLAGS) to all the POSIX node types is the class number */
 
-    utf8_Nwordchar:
+    case NPOSIXL:
+        to_complement = 1;
+        /* FALLTHROUGH */
 
-           LOAD_UTF8_CHARCLASS_ALNUM();
-           while (hardcount < max && scan < loceol &&
-                   ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+    case POSIXL:
+       PL_reg_flags |= RF_tainted;
+       if (! utf8_target) {
+           while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
+                                                                   *scan)))
             {
-               scan += UTF8SKIP(scan);
+               scan++;
+            }
+       } else {
+           while (hardcount < max && scan < loceol
+                   && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
+                                                                  (U8 *) scan)))
+            {
+                scan += UTF8SKIP(scan);
                hardcount++;
            }
-        } else {
-            while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
-                scan++;
-            }
-       }
-       break;
-    case NALNUM:
-       if (utf8_target)
-           goto utf8_Nwordchar;
-       while (scan < loceol && ! isALNUM((U8) *scan)) {
-           scan++;
        }
        break;
 
+    case POSIXD:
+        if (utf8_target) {
+            goto utf8_posix;
+        }
+        /* FALLTHROUGH */
+
     case POSIXA:
         if (utf8_target && scan + max < loceol) {
 
@@ -6972,232 +6929,170 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
            scan++;
        }
        break;
-    case NPOSIXA:
-       if (utf8_target) {
-           while (scan < loceol && hardcount < max
-                   && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
-            {
-               scan += UTF8SKIP(scan);
-                hardcount++;
-           }
-       }
-       else {
-           while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
-               scan++;
-           }
-       }
-       break;
-    case NALNUMA:
-       if (utf8_target) {
-           while (scan < loceol && hardcount < max
-                   && ! isWORDCHAR_A((U8) *scan))
-            {
-               scan += UTF8SKIP(scan);
-                hardcount++;
-           }
-       }
-       else {
-           while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
-               scan++;
-           }
-       }
-       break;
-    case NALNUML:
-       PL_reg_flags |= RF_tainted;
-       if (utf8_target) {
-           while (hardcount < max && scan < loceol &&
-                  !isALNUM_LC_utf8((U8*)scan)) {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && !isALNUM_LC(*scan))
-               scan++;
-       }
-       break;
-    case SPACEU:
-       if (utf8_target) {
 
-    utf8_space:
+    case NPOSIXD:
+        if (utf8_target) {
+            to_complement = 1;
+            goto utf8_posix;
+        }
+        /* FALL THROUGH */
 
-           while (hardcount < max && scan < loceol
-                   && is_XPERLSPACE_utf8((U8*)scan))
-            {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-           break;
-       }
-       else {
-            while (scan < loceol && isSPACE_L1((U8) *scan)) {
+    case NPOSIXA:
+        if (! utf8_target) {
+            while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
                 scan++;
             }
-           break;
-       }
-    case SPACE:
-       if (utf8_target)
-           goto utf8_space;
-
-       while (scan < loceol && isSPACE((U8) *scan)) {
-           scan++;
-       }
-       break;
-    case SPACEA:
-        if (utf8_target && scan + max < loceol) {
-
-            /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
-             * since here, to match, 1 char == 1 byte */
-            loceol = scan + max;
         }
-       while (scan < loceol && isSPACE_A((U8) *scan)) {
-           scan++;
-       }
-       break;
-    case SPACEL:
-       PL_reg_flags |= RF_tainted;
-       if (utf8_target) {
-           while (hardcount < max && scan < loceol &&
-                  isSPACE_LC_utf8((U8*)scan)) {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && isSPACE_LC(*scan))
-               scan++;
-       }
-       break;
-    case NSPACEU:
-       if (utf8_target) {
-
-    utf8_Nspace:
+        else {
 
+            /* The complement of something that matches only ASCII matches all
+             * UTF-8 variant code points, plus everything in ASCII that isn't
+             * in the class. */
            while (hardcount < max && scan < loceol
-                   && ! is_XPERLSPACE_utf8((U8*)scan))
+                   && (! UTF8_IS_INVARIANT(*scan)
+                       || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
             {
-               scan += UTF8SKIP(scan);
+                scan += UTF8SKIP(scan);
                hardcount++;
            }
-           break;
-       }
-       else {
-            while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
-                scan++;
-            }
-       }
-       break;
-    case NSPACE:
-       if (utf8_target)
-           goto utf8_Nspace;
+        }
+        break;
 
-       while (scan < loceol && ! isSPACE((U8) *scan)) {
-           scan++;
-       }
-       break;
-    case NSPACEA:
-       if (utf8_target) {
-           while (hardcount < max && scan < loceol
-                  && ! isSPACE_A((U8) *scan))
+    case NPOSIXU:
+        to_complement = 1;
+        /* FALLTHROUGH */
+
+    case POSIXU:
+       if (utf8_target) {
+            while (scan < loceol && to_complement
+                                ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
             {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
+                scan++;
+            }
        }
        else {
-           while (scan < loceol && ! isSPACE_A((U8) *scan)) {
-               scan++;
-           }
-       }
-       break;
-    case NSPACEL:
-       PL_reg_flags |= RF_tainted;
-       if (utf8_target) {
-           while (hardcount < max && scan < loceol &&
-                  !isSPACE_LC_utf8((U8*)scan)) {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && !isSPACE_LC(*scan))
-               scan++;
-       }
-       break;
-    case DIGIT:
-       if (utf8_target) {
-           LOAD_UTF8_CHARCLASS_DIGIT();
-           while (hardcount < max && scan < loceol &&
-                  swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && isDIGIT(*scan))
-               scan++;
+      utf8_posix:
+            classnum = (_char_class_number) FLAGS(p);
+            if (classnum < _FIRST_NON_SWASH_CC) {
+
+                /* Here, a swash is needed for above-Latin1 code points.
+                 * Process as many Latin1 code points as possible using the
+                 * built-in rules.  Go to another loop to finish processing upon
+                 * encountering the first above-Latin1 code point.  We could do
+                 * that in this loop as well, but the other way saves having to
+                 * test if the swash has been loaded every time through the
+                 * loop: extra space to save a test. */
+                while (hardcount < max && scan < loceol) {
+                    if (UTF8_IS_INVARIANT(*scan)) {
+                        if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
+                                                                   classnum))))
+                        {
+                            break;
+                        }
+                        scan++;
+                    }
+                    else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
+                        if (! (to_complement
+                              ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
+                                                                   *(scan + 1)),
+                                                    classnum))))
+                        {
+                            break;
+                        }
+                        scan += 2;
+                    }
+                    else {
+                        goto found_above_latin1;
+                    }
+
+                    hardcount++;
+                }
+            }
+            else {
+                /* For these character classes, the knowledge of how to handle
+                 * every code point is compiled in to Perl via a macro.  This
+                 * code is written for making the loops as tight as possible.
+                 * It could be refactored to save space instead */
+                switch (classnum) {
+                    case _CC_ENUM_SPACE:    /* XXX would require separate code
+                                               if we revert the change of \v
+                                               matching this */
+                        /* FALL THROUGH */
+                    case _CC_ENUM_PSXSPC:
+                        while (hardcount < max
+                               && scan < loceol
+                               && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
+                        {
+                            scan += UTF8SKIP(scan);
+                            hardcount++;
+                        }
+                        break;
+                    case _CC_ENUM_BLANK:
+                        while (hardcount < max
+                               && scan < loceol
+                               && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
+                        {
+                            scan += UTF8SKIP(scan);
+                            hardcount++;
+                        }
+                        break;
+                    case _CC_ENUM_XDIGIT:
+                        while (hardcount < max
+                               && scan < loceol
+                               && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
+                        {
+                            scan += UTF8SKIP(scan);
+                            hardcount++;
+                        }
+                        break;
+                    case _CC_ENUM_VERTSPACE:
+                        while (hardcount < max
+                               && scan < loceol
+                               && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
+                        {
+                            scan += UTF8SKIP(scan);
+                            hardcount++;
+                        }
+                        break;
+                    case _CC_ENUM_CNTRL:
+                        while (hardcount < max
+                               && scan < loceol
+                               && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
+                        {
+                            scan += UTF8SKIP(scan);
+                            hardcount++;
+                        }
+                        break;
+                    default:
+                        Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
+                }
+            }
        }
-       break;
-    case DIGITA:
-        if (utf8_target && scan + max < loceol) {
+        break;
 
-            /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
-             * since here, to match, 1 char == 1 byte */
-            loceol = scan + max;
+      found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
+
+        /* Load the swash if not already present */
+        if (! PL_utf8_swash_ptrs[classnum]) {
+            U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+            PL_utf8_swash_ptrs[classnum] = _core_swash_init(
+                                        "utf8", swash_property_names[classnum],
+                                        &PL_sv_undef, 1, 0, NULL, &flags);
         }
-       while (scan < loceol && isDIGIT_A((U8) *scan)) {
-           scan++;
-       }
-       break;
-    case DIGITL:
-       PL_reg_flags |= RF_tainted;
-       if (utf8_target) {
-           while (hardcount < max && scan < loceol &&
-                  isDIGIT_LC_utf8((U8*)scan)) {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && isDIGIT_LC(*scan))
-               scan++;
-       }
-       break;
-    case NDIGIT:
-       if (utf8_target) {
-           LOAD_UTF8_CHARCLASS_DIGIT();
-           while (hardcount < max && scan < loceol &&
-                  !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && !isDIGIT(*scan))
-               scan++;
-       }
-       break;
-    case NDIGITA:
-       if (utf8_target) {
-           while (hardcount < max && scan < loceol
-                  && ! isDIGIT_A((U8) *scan)) {
-               scan += UTF8SKIP(scan);
-                hardcount++;
-           }
-       }
-       else {
-           while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
-               scan++;
-           }
-       }
-       break;
-    case NDIGITL:
-       PL_reg_flags |= RF_tainted;
-       if (utf8_target) {
-           while (hardcount < max && scan < loceol &&
-                  !isDIGIT_LC_utf8((U8*)scan)) {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && !isDIGIT_LC(*scan))
-               scan++;
-       }
-       break;
+
+        while (hardcount < max && scan < loceol
+               && to_complement ^ cBOOL(_generic_utf8(
+                                       classnum,
+                                       scan,
+                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
+                                                   (U8 *) scan,
+                                                   TRUE))))
+        {
+            scan += UTF8SKIP(scan);
+            hardcount++;
+        }
+        break;
+
     case LNBREAK:
         if (utf8_target) {
            while (hardcount < max && scan < loceol &&
@@ -7216,61 +7111,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
            }
        }
        break;
-    case HORIZWS:
-        if (utf8_target) {
-           while (hardcount < max && scan < loceol &&
-                    (c=is_HORIZWS_utf8_safe(scan, loceol)))
-            {
-               scan += c;
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol)) 
-               scan++;         
-       }       
-       break;
-    case NHORIZWS:
-        if (utf8_target) {
-           while (hardcount < max && scan < loceol &&
-                        !is_HORIZWS_utf8_safe(scan, loceol))
-            {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
-               scan++;
-
-       }       
-       break;
-    case VERTWS:
-        if (utf8_target) {
-           while (hardcount < max && scan < loceol &&
-                            (c=is_VERTWS_utf8_safe(scan, loceol)))
-            {
-               scan += c;
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol)) 
-               scan++;
-
-       }       
-       break;
-    case NVERTWS:
-        if (utf8_target) {
-           while (hardcount < max && scan < loceol &&
-                                !is_VERTWS_utf8_safe(scan, loceol))
-            {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       } else {
-           while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol)) 
-               scan++;
-          
-       }       
-       break;
 
     case BOUND:
     case BOUNDA:
index 2024d15..e1fdad1 100644 (file)
@@ -6,8 +6,8 @@
 
 /* Regops and State definitions */
 
-#define REGNODE_MAX            121
-#define REGMATCH_STATE_MAX     161
+#define REGNODE_MAX            93
+#define REGMATCH_STATE_MAX     133
 
 #define        END                     0       /* 0000 End of program. */
 #define        SUCCEED                 1       /* 0x01 Return from a subroutine, basically. */
 #define        SANY                    19      /* 0x13 Match any one character. */
 #define        CANY                    20      /* 0x14 Match any one byte. */
 #define        ANYOF                   21      /* 0x15 Match character in (or not in) this class, single char match only */
-#define        ALNUM                   22      /* 0x16 Match any alphanumeric character using native charset semantics for non-utf8 */
-#define        ALNUML                  23      /* 0x17 Match any alphanumeric char in locale */
-#define        ALNUMU                  24      /* 0x18 Match any alphanumeric char using Unicode semantics */
-#define        ALNUMA                  25      /* 0x19 Match [A-Za-z_0-9] */
-#define        NALNUM                  26      /* 0x1a Match any non-alphanumeric character using native charset semantics for non-utf8 */
-#define        NALNUML                 27      /* 0x1b Match any non-alphanumeric char in locale */
-#define        NALNUMU                 28      /* 0x1c Match any non-alphanumeric char using Unicode semantics */
-#define        NALNUMA                 29      /* 0x1d Match [^A-Za-z_0-9] */
-#define        SPACE                   30      /* 0x1e Match any whitespace character using native charset semantics for non-utf8 */
-#define        SPACEL                  31      /* 0x1f Match any whitespace char in locale */
-#define        SPACEU                  32      /* 0x20 Match any whitespace char using Unicode semantics */
-#define        SPACEA                  33      /* 0x21 Match [ \t\n\f\r] */
-#define        NSPACE                  34      /* 0x22 Match any non-whitespace character using native charset semantics for non-utf8 */
-#define        NSPACEL                 35      /* 0x23 Match any non-whitespace char in locale */
-#define        NSPACEU                 36      /* 0x24 Match any non-whitespace char using Unicode semantics */
-#define        NSPACEA                 37      /* 0x25 Match [^ \t\n\f\r] */
-#define        DIGIT                   38      /* 0x26 Match any numeric character using native charset semantics for non-utf8 */
-#define        DIGITL                  39      /* 0x27 Match any numeric character in locale */
-#define        PLACEHOLDER1            40      /* 0x28 placeholder for missing DIGITU */
-#define        DIGITA                  41      /* 0x29 Match [0-9] */
-#define        NDIGIT                  42      /* 0x2a Match any non-numeric character using native charset semantics for non-utf8 */
-#define        NDIGITL                 43      /* 0x2b Match any non-numeric character in locale */
-#define        PLACEHOLDER2            44      /* 0x2c placeholder for missing NDIGITU */
-#define        NDIGITA                 45      /* 0x2d Match [^0-9] */
-#define        POSIXD                  46      /* 0x2e currently unused except as a placeholder */
-#define        POSIXL                  47      /* 0x2f currently unused except as a placeholder */
-#define        POSIXU                  48      /* 0x30 currently unused except as a placeholder */
-#define        POSIXA                  49      /* 0x31 Some [[:class:]] under /a; the FLAGS field gives which one */
-#define        NPOSIXD                 50      /* 0x32 currently unused except as a placeholder */
-#define        NPOSIXL                 51      /* 0x33 currently unused except as a placeholder */
-#define        NPOSIXU                 52      /* 0x34 currently unused except as a placeholder */
-#define        NPOSIXA                 53      /* 0x35 complement of POSIXA, [[:^class:]] */
-#define        CLUMP                   54      /* 0x36 Match any extended grapheme cluster sequence */
-#define        BRANCH                  55      /* 0x37 Match this alternative, or the next... */
-#define        BACK                    56      /* 0x38 Match "", "next" ptr points backward. */
-#define        EXACT                   57      /* 0x39 Match this string (preceded by length). */
-#define        EXACTF                  58      /* 0x3a Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
-#define        EXACTFL                 59      /* 0x3b Match this string (not guaranteed to be folded) using /il rules (w/len). */
-#define        EXACTFU                 60      /* 0x3c Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
-#define        EXACTFA                 61      /* 0x3d Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
-#define        EXACTFU_SS              62      /* 0x3e Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
-#define        EXACTFU_TRICKYFOLD      63      /* 0x3f Match this folded UTF-8 string using /iu rules */
-#define        NOTHING                 64      /* 0x40 Match empty string. */
-#define        TAIL                    65      /* 0x41 Match empty string. Can jump here from outside. */
-#define        STAR                    66      /* 0x42 Match this (simple) thing 0 or more times. */
-#define        PLUS                    67      /* 0x43 Match this (simple) thing 1 or more times. */
-#define        CURLY                   68      /* 0x44 Match this simple thing {n,m} times. */
-#define        CURLYN                  69      /* 0x45 Capture next-after-this simple thing */
-#define        CURLYM                  70      /* 0x46 Capture this medium-complex thing {n,m} times. */
-#define        CURLYX                  71      /* 0x47 Match this complex thing {n,m} times. */
-#define        WHILEM                  72      /* 0x48 Do curly processing and see if rest matches. */
-#define        OPEN                    73      /* 0x49 Mark this point in input as start of */
-#define        CLOSE                   74      /* 0x4a Analogous to OPEN. */
-#define        REF                     75      /* 0x4b Match some already matched string */
-#define        REFF                    76      /* 0x4c Match already matched string, folded using native charset semantics for non-utf8 */
-#define        REFFL                   77      /* 0x4d Match already matched string, folded in loc. */
-#define        REFFU                   78      /* 0x4e Match already matched string, folded using unicode semantics for non-utf8 */
-#define        REFFA                   79      /* 0x4f Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define        NREF                    80      /* 0x50 Match some already matched string */
-#define        NREFF                   81      /* 0x51 Match already matched string, folded using native charset semantics for non-utf8 */
-#define        NREFFL                  82      /* 0x52 Match already matched string, folded in loc. */
-#define        NREFFU                  83      /* 0x53 Match already matched string, folded using unicode semantics for non-utf8 */
-#define        NREFFA                  84      /* 0x54 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define        IFMATCH                 85      /* 0x55 Succeeds if the following matches. */
-#define        UNLESSM                 86      /* 0x56 Fails if the following matches. */
-#define        SUSPEND                 87      /* 0x57 "Independent" sub-RE. */
-#define        IFTHEN                  88      /* 0x58 Switch, should be preceded by switcher . */
-#define        GROUPP                  89      /* 0x59 Whether the group matched. */
-#define        LONGJMP                 90      /* 0x5a Jump far away. */
-#define        BRANCHJ                 91      /* 0x5b BRANCH with long offset. */
-#define        EVAL                    92      /* 0x5c Execute some Perl code. */
-#define        MINMOD                  93      /* 0x5d Next operator is not greedy. */
-#define        LOGICAL                 94      /* 0x5e Next opcode should set the flag only. */
-#define        RENUM                   95      /* 0x5f Group with independently numbered parens. */
-#define        TRIE                    96      /* 0x60 Match many EXACT(F[ALU]?)? at once. flags==type */
-#define        TRIEC                   97      /* 0x61 Same as TRIE, but with embedded charclass data */
-#define        AHOCORASICK             98      /* 0x62 Aho Corasick stclass. flags==type */
-#define        AHOCORASICKC            99      /* 0x63 Same as AHOCORASICK, but with embedded charclass data */
-#define        GOSUB                   100     /* 0x64 recurse to paren arg1 at (signed) ofs arg2 */
-#define        GOSTART                 101     /* 0x65 recurse to start of pattern */
-#define        NGROUPP                 102     /* 0x66 Whether the group matched. */
-#define        INSUBP                  103     /* 0x67 Whether we are in a specific recurse. */
-#define        DEFINEP                 104     /* 0x68 Never execute directly. */
-#define        ENDLIKE                 105     /* 0x69 Used only for the type field of verbs */
-#define        OPFAIL                  106     /* 0x6a Same as (?!) */
-#define        ACCEPT                  107     /* 0x6b Accepts the current matched string. */
-#define        VERB                    108     /* 0x6c Used only for the type field of verbs */
-#define        PRUNE                   109     /* 0x6d Pattern fails at this startpoint if no-backtracking through this */
-#define        MARKPOINT               110     /* 0x6e Push the current location for rollback by cut. */
-#define        SKIP                    111     /* 0x6f On failure skip forward (to the mark) before retrying */
-#define        COMMIT                  112     /* 0x70 Pattern fails outright if backtracking through this */
-#define        CUTGROUP                113     /* 0x71 On failure go to the next alternation in the group */
-#define        KEEPS                   114     /* 0x72 $& begins here. */
-#define        LNBREAK                 115     /* 0x73 generic newline pattern */
-#define        VERTWS                  116     /* 0x74 vertical whitespace         (Perl 6) */
-#define        NVERTWS                 117     /* 0x75 not vertical whitespace     (Perl 6) */
-#define        HORIZWS                 118     /* 0x76 horizontal whitespace       (Perl 6) */
-#define        NHORIZWS                119     /* 0x77 not horizontal whitespace   (Perl 6) */
-#define        OPTIMIZED               120     /* 0x78 Placeholder for dump. */
-#define        PSEUDO                  121     /* 0x79 Pseudo opcode for internal use. */
+#define        POSIXD                  22      /* 0x16 Some [[:class:]] under /d; the FLAGS field gives which one */
+#define        POSIXL                  23      /* 0x17 Some [[:class:]] under /l; the FLAGS field gives which one */
+#define        POSIXU                  24      /* 0x18 Some [[:class:]] under /u; the FLAGS field gives which one */
+#define        POSIXA                  25      /* 0x19 Some [[:class:]] under /a; the FLAGS field gives which one */
+#define        NPOSIXD                 26      /* 0x1a complement of POSIXD, [[:^class:]] */
+#define        NPOSIXL                 27      /* 0x1b complement of POSIXL, [[:^class:]] */
+#define        NPOSIXU                 28      /* 0x1c complement of POSIXU, [[:^class:]] */
+#define        NPOSIXA                 29      /* 0x1d complement of POSIXA, [[:^class:]] */
+#define        CLUMP                   30      /* 0x1e Match any extended grapheme cluster sequence */
+#define        BRANCH                  31      /* 0x1f Match this alternative, or the next... */
+#define        BACK                    32      /* 0x20 Match "", "next" ptr points backward. */
+#define        EXACT                   33      /* 0x21 Match this string (preceded by length). */
+#define        EXACTF                  34      /* 0x22 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
+#define        EXACTFL                 35      /* 0x23 Match this string (not guaranteed to be folded) using /il rules (w/len). */
+#define        EXACTFU                 36      /* 0x24 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
+#define        EXACTFA                 37      /* 0x25 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
+#define        EXACTFU_SS              38      /* 0x26 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
+#define        EXACTFU_TRICKYFOLD      39      /* 0x27 Match this folded UTF-8 string using /iu rules */
+#define        NOTHING                 40      /* 0x28 Match empty string. */
+#define        TAIL                    41      /* 0x29 Match empty string. Can jump here from outside. */
+#define        STAR                    42      /* 0x2a Match this (simple) thing 0 or more times. */
+#define        PLUS                    43      /* 0x2b Match this (simple) thing 1 or more times. */
+#define        CURLY                   44      /* 0x2c Match this simple thing {n,m} times. */
+#define        CURLYN                  45      /* 0x2d Capture next-after-this simple thing */
+#define        CURLYM                  46      /* 0x2e Capture this medium-complex thing {n,m} times. */
+#define        CURLYX                  47      /* 0x2f Match this complex thing {n,m} times. */
+#define        WHILEM                  48      /* 0x30 Do curly processing and see if rest matches. */
+#define        OPEN                    49      /* 0x31 Mark this point in input as start of */
+#define        CLOSE                   50      /* 0x32 Analogous to OPEN. */
+#define        REF                     51      /* 0x33 Match some already matched string */
+#define        REFF                    52      /* 0x34 Match already matched string, folded using native charset semantics for non-utf8 */
+#define        REFFL                   53      /* 0x35 Match already matched string, folded in loc. */
+#define        REFFU                   54      /* 0x36 Match already matched string, folded using unicode semantics for non-utf8 */
+#define        REFFA                   55      /* 0x37 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define        NREF                    56      /* 0x38 Match some already matched string */
+#define        NREFF                   57      /* 0x39 Match already matched string, folded using native charset semantics for non-utf8 */
+#define        NREFFL                  58      /* 0x3a Match already matched string, folded in loc. */
+#define        NREFFU                  59      /* 0x3b Match already matched string, folded using unicode semantics for non-utf8 */
+#define        NREFFA                  60      /* 0x3c Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define        IFMATCH                 61      /* 0x3d Succeeds if the following matches. */
+#define        UNLESSM                 62      /* 0x3e Fails if the following matches. */
+#define        SUSPEND                 63      /* 0x3f "Independent" sub-RE. */
+#define        IFTHEN                  64      /* 0x40 Switch, should be preceded by switcher . */
+#define        GROUPP                  65      /* 0x41 Whether the group matched. */
+#define        LONGJMP                 66      /* 0x42 Jump far away. */
+#define        BRANCHJ                 67      /* 0x43 BRANCH with long offset. */
+#define        EVAL                    68      /* 0x44 Execute some Perl code. */
+#define        MINMOD                  69      /* 0x45 Next operator is not greedy. */
+#define        LOGICAL                 70      /* 0x46 Next opcode should set the flag only. */
+#define        RENUM                   71      /* 0x47 Group with independently numbered parens. */
+#define        TRIE                    72      /* 0x48 Match many EXACT(F[ALU]?)? at once. flags==type */
+#define        TRIEC                   73      /* 0x49 Same as TRIE, but with embedded charclass data */
+#define        AHOCORASICK             74      /* 0x4a Aho Corasick stclass. flags==type */
+#define        AHOCORASICKC            75      /* 0x4b Same as AHOCORASICK, but with embedded charclass data */
+#define        GOSUB                   76      /* 0x4c recurse to paren arg1 at (signed) ofs arg2 */
+#define        GOSTART                 77      /* 0x4d recurse to start of pattern */
+#define        NGROUPP                 78      /* 0x4e Whether the group matched. */
+#define        INSUBP                  79      /* 0x4f Whether we are in a specific recurse. */
+#define        DEFINEP                 80      /* 0x50 Never execute directly. */
+#define        ENDLIKE                 81      /* 0x51 Used only for the type field of verbs */
+#define        OPFAIL                  82      /* 0x52 Same as (?!) */
+#define        ACCEPT                  83      /* 0x53 Accepts the current matched string. */
+#define        VERB                    84      /* 0x54 Used only for the type field of verbs */
+#define        PRUNE                   85      /* 0x55 Pattern fails at this startpoint if no-backtracking through this */
+#define        MARKPOINT               86      /* 0x56 Push the current location for rollback by cut. */
+#define        SKIP                    87      /* 0x57 On failure skip forward (to the mark) before retrying */
+#define        COMMIT                  88      /* 0x58 Pattern fails outright if backtracking through this */
+#define        CUTGROUP                89      /* 0x59 On failure go to the next alternation in the group */
+#define        KEEPS                   90      /* 0x5a $& begins here. */
+#define        LNBREAK                 91      /* 0x5b generic newline pattern */
+#define        OPTIMIZED               92      /* 0x5c Placeholder for dump. */
+#define        PSEUDO                  93      /* 0x5d Pseudo opcode for internal use. */
        /* ------------ States ------------- */
 #define        TRIE_next               (REGNODE_MAX + 1)       /* state for TRIE */
 #define        TRIE_next_fail          (REGNODE_MAX + 2)       /* state for TRIE */
@@ -201,30 +173,6 @@ EXTCONST U8 PL_regkind[] = {
        REG_ANY,        /* SANY                   */
        REG_ANY,        /* CANY                   */
        ANYOF,          /* ANYOF                  */
-       ALNUM,          /* ALNUM                  */
-       ALNUM,          /* ALNUML                 */
-       ALNUM,          /* ALNUMU                 */
-       ALNUM,          /* ALNUMA                 */
-       NALNUM,         /* NALNUM                 */
-       NALNUM,         /* NALNUML                */
-       NALNUM,         /* NALNUMU                */
-       NALNUM,         /* NALNUMA                */
-       SPACE,          /* SPACE                  */
-       SPACE,          /* SPACEL                 */
-       SPACE,          /* SPACEU                 */
-       SPACE,          /* SPACEA                 */
-       NSPACE,         /* NSPACE                 */
-       NSPACE,         /* NSPACEL                */
-       NSPACE,         /* NSPACEU                */
-       NSPACE,         /* NSPACEA                */
-       DIGIT,          /* DIGIT                  */
-       DIGIT,          /* DIGITL                 */
-       NOTHING,        /* PLACEHOLDER1           */
-       DIGIT,          /* DIGITA                 */
-       NDIGIT,         /* NDIGIT                 */
-       NDIGIT,         /* NDIGITL                */
-       NOTHING,        /* PLACEHOLDER2           */
-       NDIGIT,         /* NDIGITA                */
        POSIXD,         /* POSIXD                 */
        POSIXD,         /* POSIXL                 */
        POSIXD,         /* POSIXU                 */
@@ -295,10 +243,6 @@ EXTCONST U8 PL_regkind[] = {
        VERB,           /* CUTGROUP               */
        KEEPS,          /* KEEPS                  */
        LNBREAK,        /* LNBREAK                */
-       VERTWS,         /* VERTWS                 */
-       NVERTWS,        /* NVERTWS                */
-       HORIZWS,        /* HORIZWS                */
-       NHORIZWS,       /* NHORIZWS               */
        NOTHING,        /* OPTIMIZED              */
        PSEUDO,         /* PSEUDO                 */
        /* ------------ States ------------- */
@@ -371,30 +315,6 @@ static const U8 regarglen[] = {
        0,                                      /* SANY         */
        0,                                      /* CANY         */
        0,                                      /* ANYOF        */
-       0,                                      /* ALNUM        */
-       0,                                      /* ALNUML       */
-       0,                                      /* ALNUMU       */
-       0,                                      /* ALNUMA       */
-       0,                                      /* NALNUM       */
-       0,                                      /* NALNUML      */
-       0,                                      /* NALNUMU      */
-       0,                                      /* NALNUMA      */
-       0,                                      /* SPACE        */
-       0,                                      /* SPACEL       */
-       0,                                      /* SPACEU       */
-       0,                                      /* SPACEA       */
-       0,                                      /* NSPACE       */
-       0,                                      /* NSPACEL      */
-       0,                                      /* NSPACEU      */
-       0,                                      /* NSPACEA      */
-       0,                                      /* DIGIT        */
-       0,                                      /* DIGITL       */
-       0,                                      /* PLACEHOLDER1 */
-       0,                                      /* DIGITA       */
-       0,                                      /* NDIGIT       */
-       0,                                      /* NDIGITL      */
-       0,                                      /* PLACEHOLDER2 */
-       0,                                      /* NDIGITA      */
        0,                                      /* POSIXD       */
        0,                                      /* POSIXL       */
        0,                                      /* POSIXU       */
@@ -465,10 +385,6 @@ static const U8 regarglen[] = {
        EXTRA_SIZE(struct regnode_1),           /* CUTGROUP     */
        0,                                      /* KEEPS        */
        0,                                      /* LNBREAK      */
-       0,                                      /* VERTWS       */
-       0,                                      /* NVERTWS      */
-       0,                                      /* HORIZWS      */
-       0,                                      /* NHORIZWS     */
        0,                                      /* OPTIMIZED    */
        0,                                      /* PSEUDO       */
 };
@@ -498,30 +414,6 @@ static const char reg_off_by_arg[] = {
        0,      /* SANY         */
        0,      /* CANY         */
        0,      /* ANYOF        */
-       0,      /* ALNUM        */
-       0,      /* ALNUML       */
-       0,      /* ALNUMU       */
-       0,      /* ALNUMA       */
-       0,      /* NALNUM       */
-       0,      /* NALNUML      */
-       0,      /* NALNUMU      */
-       0,      /* NALNUMA      */
-       0,      /* SPACE        */
-       0,      /* SPACEL       */
-       0,      /* SPACEU       */
-       0,      /* SPACEA       */
-       0,      /* NSPACE       */
-       0,      /* NSPACEL      */
-       0,      /* NSPACEU      */
-       0,      /* NSPACEA      */
-       0,      /* DIGIT        */
-       0,      /* DIGITL       */
-       0,      /* PLACEHOLDER1 */
-       0,      /* DIGITA       */
-       0,      /* NDIGIT       */
-       0,      /* NDIGITL      */
-       0,      /* PLACEHOLDER2 */
-       0,      /* NDIGITA      */
        0,      /* POSIXD       */
        0,      /* POSIXL       */
        0,      /* POSIXU       */
@@ -592,10 +484,6 @@ static const char reg_off_by_arg[] = {
        0,      /* CUTGROUP     */
        0,      /* KEEPS        */
        0,      /* LNBREAK      */
-       0,      /* VERTWS       */
-       0,      /* NVERTWS      */
-       0,      /* HORIZWS      */
-       0,      /* NHORIZWS     */
        0,      /* OPTIMIZED    */
        0,      /* PSEUDO       */
 };
@@ -630,106 +518,78 @@ EXTCONST char * const PL_reg_name[] = {
        "SANY",                         /* 0x13 */
        "CANY",                         /* 0x14 */
        "ANYOF",                        /* 0x15 */
-       "ALNUM",                        /* 0x16 */
-       "ALNUML",                       /* 0x17 */
-       "ALNUMU",                       /* 0x18 */
-       "ALNUMA",                       /* 0x19 */
-       "NALNUM",                       /* 0x1a */
-       "NALNUML",                      /* 0x1b */
-       "NALNUMU",                      /* 0x1c */
-       "NALNUMA",                      /* 0x1d */
-       "SPACE",                        /* 0x1e */
-       "SPACEL",                       /* 0x1f */
-       "SPACEU",                       /* 0x20 */
-       "SPACEA",                       /* 0x21 */
-       "NSPACE",                       /* 0x22 */
-       "NSPACEL",                      /* 0x23 */
-       "NSPACEU",                      /* 0x24 */
-       "NSPACEA",                      /* 0x25 */
-       "DIGIT",                        /* 0x26 */
-       "DIGITL",                       /* 0x27 */
-       "PLACEHOLDER1",                 /* 0x28 */
-       "DIGITA",                       /* 0x29 */
-       "NDIGIT",                       /* 0x2a */
-       "NDIGITL",                      /* 0x2b */
-       "PLACEHOLDER2",                 /* 0x2c */
-       "NDIGITA",                      /* 0x2d */
-       "POSIXD",                       /* 0x2e */
-       "POSIXL",                       /* 0x2f */
-       "POSIXU",                       /* 0x30 */
-       "POSIXA",                       /* 0x31 */
-       "NPOSIXD",                      /* 0x32 */
-       "NPOSIXL",                      /* 0x33 */
-       "NPOSIXU",                      /* 0x34 */
-       "NPOSIXA",                      /* 0x35 */
-       "CLUMP",                        /* 0x36 */
-       "BRANCH",                       /* 0x37 */
-       "BACK",                         /* 0x38 */
-       "EXACT",                        /* 0x39 */
-       "EXACTF",                       /* 0x3a */
-       "EXACTFL",                      /* 0x3b */
-       "EXACTFU",                      /* 0x3c */
-       "EXACTFA",                      /* 0x3d */
-       "EXACTFU_SS",                   /* 0x3e */
-       "EXACTFU_TRICKYFOLD",           /* 0x3f */
-       "NOTHING",                      /* 0x40 */
-       "TAIL",                         /* 0x41 */
-       "STAR",                         /* 0x42 */
-       "PLUS",                         /* 0x43 */
-       "CURLY",                        /* 0x44 */
-       "CURLYN",                       /* 0x45 */
-       "CURLYM",                       /* 0x46 */
-       "CURLYX",                       /* 0x47 */
-       "WHILEM",                       /* 0x48 */
-       "OPEN",                         /* 0x49 */
-       "CLOSE",                        /* 0x4a */
-       "REF",                          /* 0x4b */
-       "REFF",                         /* 0x4c */
-       "REFFL",                        /* 0x4d */
-       "REFFU",                        /* 0x4e */
-       "REFFA",                        /* 0x4f */
-       "NREF",                         /* 0x50 */
-       "NREFF",                        /* 0x51 */
-       "NREFFL",                       /* 0x52 */
-       "NREFFU",                       /* 0x53 */
-       "NREFFA",                       /* 0x54 */
-       "IFMATCH",                      /* 0x55 */
-       "UNLESSM",                      /* 0x56 */
-       "SUSPEND",                      /* 0x57 */
-       "IFTHEN",                       /* 0x58 */
-       "GROUPP",                       /* 0x59 */
-       "LONGJMP",                      /* 0x5a */
-       "BRANCHJ",                      /* 0x5b */
-       "EVAL",                         /* 0x5c */
-       "MINMOD",                       /* 0x5d */
-       "LOGICAL",                      /* 0x5e */
-       "RENUM",                        /* 0x5f */
-       "TRIE",                         /* 0x60 */
-       "TRIEC",                        /* 0x61 */
-       "AHOCORASICK",                  /* 0x62 */
-       "AHOCORASICKC",                 /* 0x63 */
-       "GOSUB",                        /* 0x64 */
-       "GOSTART",                      /* 0x65 */
-       "NGROUPP",                      /* 0x66 */
-       "INSUBP",                       /* 0x67 */
-       "DEFINEP",                      /* 0x68 */
-       "ENDLIKE",                      /* 0x69 */
-       "OPFAIL",                       /* 0x6a */
-       "ACCEPT",                       /* 0x6b */
-       "VERB",                         /* 0x6c */
-       "PRUNE",                        /* 0x6d */
-       "MARKPOINT",                    /* 0x6e */
-       "SKIP",                         /* 0x6f */
-       "COMMIT",                       /* 0x70 */
-       "CUTGROUP",                     /* 0x71 */
-       "KEEPS",                        /* 0x72 */
-       "LNBREAK",                      /* 0x73 */
-       "VERTWS",                       /* 0x74 */
-       "NVERTWS",                      /* 0x75 */
-       "HORIZWS",                      /* 0x76 */
-       "NHORIZWS",                     /* 0x77 */
-       "OPTIMIZED",                    /* 0x78 */
-       "PSEUDO",                       /* 0x79 */
+       "POSIXD",                       /* 0x16 */
+       "POSIXL",                       /* 0x17 */
+       "POSIXU",                       /* 0x18 */
+       "POSIXA",                       /* 0x19 */
+       "NPOSIXD",                      /* 0x1a */
+       "NPOSIXL",                      /* 0x1b */
+       "NPOSIXU",                      /* 0x1c */
+       "NPOSIXA",                      /* 0x1d */
+       "CLUMP",                        /* 0x1e */
+       "BRANCH",                       /* 0x1f */
+       "BACK",                         /* 0x20 */
+       "EXACT",                        /* 0x21 */
+       "EXACTF",                       /* 0x22 */
+       "EXACTFL",                      /* 0x23 */
+       "EXACTFU",                      /* 0x24 */
+       "EXACTFA",                      /* 0x25 */
+       "EXACTFU_SS",                   /* 0x26 */
+       "EXACTFU_TRICKYFOLD",           /* 0x27 */
+       "NOTHING",                      /* 0x28 */
+       "TAIL",                         /* 0x29 */
+       "STAR",                         /* 0x2a */
+       "PLUS",                         /* 0x2b */
+       "CURLY",                        /* 0x2c */
+       "CURLYN",                       /* 0x2d */
+       "CURLYM",                       /* 0x2e */
+       "CURLYX",                       /* 0x2f */
+       "WHILEM",                       /* 0x30 */
+       "OPEN",                         /* 0x31 */
+       "CLOSE",                        /* 0x32 */
+       "REF",                          /* 0x33 */
+       "REFF",                         /* 0x34 */
+       "REFFL",                        /* 0x35 */
+       "REFFU",                        /* 0x36 */
+       "REFFA",                        /* 0x37 */
+       "NREF",                         /* 0x38 */
+       "NREFF",                        /* 0x39 */
+       "NREFFL",                       /* 0x3a */
+       "NREFFU",                       /* 0x3b */
+       "NREFFA",                       /* 0x3c */
+       "IFMATCH",                      /* 0x3d */
+       "UNLESSM",                      /* 0x3e */
+       "SUSPEND",                      /* 0x3f */
+       "IFTHEN",                       /* 0x40 */
+       "GROUPP",                       /* 0x41 */
+       "LONGJMP",                      /* 0x42 */
+       "BRANCHJ",                      /* 0x43 */
+       "EVAL",                         /* 0x44 */
+       "MINMOD",                       /* 0x45 */
+       "LOGICAL",                      /* 0x46 */
+       "RENUM",                        /* 0x47 */
+       "TRIE",                         /* 0x48 */
+       "TRIEC",                        /* 0x49 */
+       "AHOCORASICK",                  /* 0x4a */
+       "AHOCORASICKC",                 /* 0x4b */
+       "GOSUB",                        /* 0x4c */
+       "GOSTART",                      /* 0x4d */
+       "NGROUPP",                      /* 0x4e */
+       "INSUBP",                       /* 0x4f */
+       "DEFINEP",                      /* 0x50 */
+       "ENDLIKE",                      /* 0x51 */
+       "OPFAIL",                       /* 0x52 */
+       "ACCEPT",                       /* 0x53 */
+       "VERB",                         /* 0x54 */
+       "PRUNE",                        /* 0x55 */
+       "MARKPOINT",                    /* 0x56 */
+       "SKIP",                         /* 0x57 */
+       "COMMIT",                       /* 0x58 */
+       "CUTGROUP",                     /* 0x59 */
+       "KEEPS",                        /* 0x5a */
+       "LNBREAK",                      /* 0x5b */
+       "OPTIMIZED",                    /* 0x5c */
+       "PSEUDO",                       /* 0x5d */
        /* ------------ States ------------- */
        "TRIE_next",                    /* REGNODE_MAX +0x01 */
        "TRIE_next_fail",               /* REGNODE_MAX +0x02 */
@@ -834,7 +694,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
 EXTCONST U8 PL_varies_bitmask[];
 #else
 EXTCONST U8 PL_varies_bitmask[] = {
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x01, 0xFC, 0xF9, 0x9F, 0x09, 0x00, 0x00, 0x00, 0x00
+    0x00, 0x00, 0x00, 0xC0, 0x01, 0xFC, 0xF9, 0x9F, 0x09, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */
 
@@ -846,11 +706,8 @@ EXTCONST U8 PL_varies_bitmask[] = {
 EXTCONST U8 PL_simple[] __attribute__deprecated__;
 #else
 EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
-    REG_ANY, SANY, CANY, ANYOF, ALNUM, ALNUML, ALNUMU, ALNUMA, NALNUM,
-    NALNUML, NALNUMU, NALNUMA, SPACE, SPACEL, SPACEU, SPACEA, NSPACE,
-    NSPACEL, NSPACEU, NSPACEA, DIGIT, DIGITL, DIGITA, NDIGIT, NDIGITL,
-    NDIGITA, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD, NPOSIXL, NPOSIXU,
-    NPOSIXA, VERTWS, NVERTWS, HORIZWS, NHORIZWS,
+    REG_ANY, SANY, CANY, ANYOF, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD,
+    NPOSIXL, NPOSIXU, NPOSIXA,
     0
 };
 #endif /* DOINIT */
@@ -859,7 +716,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
 EXTCONST U8 PL_simple_bitmask[];
 #else
 EXTCONST U8 PL_simple_bitmask[] = {
-    0x00, 0x00, 0xFC, 0xFF, 0xFF, 0xEE, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x00
+    0x00, 0x00, 0xFC, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */