This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Optimize e.g., /[^\w]/, /[[^:word:]]/ into /\W/
authorKarl Williamson <public@khwilliamson.com>
Wed, 27 Jun 2012 20:43:41 +0000 (14:43 -0600)
committerKarl Williamson <public@khwilliamson.com>
Sat, 30 Jun 2012 04:22:42 +0000 (22:22 -0600)
This optimizes character classes that have a single element that is one
of the ops that have the same meaning outside (namely \d, \h, \s, \w,
\v, :word:, :digit: and their complements) to that op.  Those
ops take less space than a character class and run faster.   An initial
'^' for complementing the class is also handled.

regcomp.c
regcomp.sym

index 155ef09..22a220f 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10978,6 +10978,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
                               Optimizations may be possible if this is tiny */
     UV n;
 
+    /* Certain named classes have equivalents that can appear outside a
+     * character class, e.g. \w.  These flags are set for these classes.  The
+     * first flag indicates the op depends on the character set modifier, like
+     * /d, /u....  The second is for those that don't have this dependency. */
+    bool has_special_charset_op = FALSE;
+    bool has_special_non_charset_op = FALSE;
+
     /* Unicode properties are stored in a swash; this holds the current one
      * being parsed.  If this swash is the only above-latin1 component of the
      * character class, an optimization is to pass it directly on to the
@@ -11371,14 +11378,44 @@ parseit:
                }
 
                range = 0; /* this was not a true range */
+                element_count += 2; /* So counts for three values */
            }
 
-           if (!SIZE_ONLY) {
+           if (SIZE_ONLY) {
+
+                /* In the first pass, do a little extra work so below can
+                 * possibly optimize the whole node to one of the nodes that
+                 * correspond to the classes given below */
+
+                /* The optimization will only take place if there is a single
+                 * element in the class, so can skip if there is more than one
+                 */
+                if (element_count == 1) {
 
                /* Possible truncation here but in some 64-bit environments
                 * the compiler gets heartburn about switch on 64-bit values.
                 * A similar issue a little earlier when switching on value.
                 * --jhi */
+                    switch ((I32)namedclass) {
+                        case ANYOF_ALNUM:
+                        case ANYOF_NALNUM:
+                        case ANYOF_DIGIT:
+                        case ANYOF_NDIGIT:
+                        case ANYOF_SPACE:
+                        case ANYOF_NSPACE:
+                            has_special_charset_op = TRUE;
+                            break;
+
+                        case ANYOF_HORIZWS:
+                        case ANYOF_NHORIZWS:
+                        case ANYOF_VERTWS:
+                        case ANYOF_NVERTWS:
+                            has_special_non_charset_op = TRUE;
+                            break;
+                    }
+                }
+            }
+            else {
                switch ((I32)namedclass) {
 
                case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
@@ -11439,10 +11476,12 @@ parseit:
                     * them */
                    DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
                         PL_PosixDigit, "XPosixDigit", listsv);
+                    has_special_charset_op = TRUE;
                    break;
                case ANYOF_NDIGIT:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+                    has_special_charset_op = TRUE;
                    break;
                case ANYOF_GRAPH:
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
@@ -11459,10 +11498,12 @@ parseit:
                     * cp_list is subject to folding.  It turns out that \h
                     * is just a synonym for XPosixBlank */
                    _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
+                    has_special_non_charset_op = TRUE;
                    break;
                case ANYOF_NHORIZWS:
                     _invlist_union_complement_2nd(cp_list,
                                                  PL_XPosixBlank, &cp_list);
+                    has_special_non_charset_op = TRUE;
                    break;
                case ANYOF_LOWER:
                case ANYOF_NLOWER:
@@ -11521,10 +11562,12 @@ parseit:
                case ANYOF_SPACE:
                     DO_POSIX(ret, namedclass, properties,
                                             PL_PerlSpace, PL_XPerlSpace);
+                    has_special_charset_op = TRUE;
                    break;
                case ANYOF_NSPACE:
                     DO_N_POSIX(ret, namedclass, properties,
                                             PL_PerlSpace, PL_XPerlSpace);
+                    has_special_charset_op = TRUE;
                    break;
                case ANYOF_UPPER:   /* Same as LOWER, above */
                case ANYOF_NUPPER:
@@ -11556,10 +11599,12 @@ parseit:
                case ANYOF_ALNUM:   /* Really is 'Word' */
                    DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+                    has_special_charset_op = TRUE;
                    break;
                case ANYOF_NALNUM:
                    DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+                    has_special_charset_op = TRUE;
                    break;
                case ANYOF_VERTWS:
                    /* For these, we use the cp_list, as /d doesn't make a
@@ -11567,10 +11612,12 @@ parseit:
                     * if these characters had folds other than themselves, as
                     * cp_list is subject to folding */
                    _invlist_union(cp_list, PL_VertSpace, &cp_list);
+                    has_special_non_charset_op = TRUE;
                    break;
                case ANYOF_NVERTWS:
                     _invlist_union_complement_2nd(cp_list,
                                                     PL_VertSpace, &cp_list);
+                    has_special_non_charset_op = TRUE;
                    break;
                case ANYOF_XDIGIT:
                     DO_POSIX(ret, namedclass, properties,
@@ -11662,6 +11709,93 @@ parseit:
        range = 0; /* this range (if it was one) is done now */
     }
 
+    /* [\w] can be optimized into \w, but not if there is anything else in the
+     * brackets (except for an initial '^' which indictes omplementing) */
+    if (element_count == 1 && (has_special_charset_op || has_special_non_charset_op)) {
+        U8 op;
+        bool invert = ANYOF_FLAGS(ret) & ANYOF_INVERT;
+        const char * cur_parse = RExC_parse;
+
+        if (has_special_charset_op) {
+            U8 offset = get_regex_charset(RExC_flags);
+
+            /* /aa is the same as /a for these */
+            if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
+                offset = REGEX_ASCII_RESTRICTED_CHARSET;
+            }
+            switch ((I32)namedclass) {
+                case ANYOF_NALNUM:
+                    invert = ! invert;
+                    /* FALLTHROUGH */
+                case ANYOF_ALNUM:
+                    op = ALNUM;
+                    break;
+                case ANYOF_NSPACE:
+                    invert = ! invert;
+                    /* FALLTHROUGH */
+                case ANYOF_SPACE:
+                    op = SPACE;
+                    break;
+                case ANYOF_NDIGIT:
+                    invert = ! invert;
+                    /* FALLTHROUGH */
+                case ANYOF_DIGIT:
+                    op = DIGIT;
+
+                    /* There is no DIGITU */
+                    if (offset == REGEX_UNICODE_CHARSET) {
+                        offset = REGEX_DEPENDS_CHARSET;
+                    }
+                    break;
+                default:
+                    Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
+            }
+
+            /* The number of varieties of each of these is the same, hence, so
+             * is the delta between the normal and complemented nodes */
+            if (invert) {
+                offset += NALNUM - ALNUM;
+            }
+
+            op += offset;
+        }
+        else if (has_special_non_charset_op) {
+            switch ((I32)namedclass) {
+                case ANYOF_NHORIZWS:
+                    invert = ! invert;
+                    /* FALLTHROUGH */
+                case ANYOF_HORIZWS:
+                    op = HORIZWS;
+                    break;
+                case ANYOF_NVERTWS:
+                    invert = ! invert;
+                    /* FALLTHROUGH */
+                case ANYOF_VERTWS:
+                    op = VERTWS;
+                    break;
+                default:
+                    Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
+            }
+
+            /* The complement version of each of these nodes is adjacently next
+             * */
+            if (invert) {
+                op++;
+            }
+        }
+
+        /* Throw away this ANYOF regnode, and emit the calculated one, which
+         * should correspond to the beginning, not current, state of the parse
+         */
+        RExC_parse = (char *)orig_parse;
+        RExC_emit = (regnode *)orig_emit;
+        ret = reg_node(pRExC_state, op);
+        RExC_parse = (char *) cur_parse;
+
+        SvREFCNT_dec(listsv);
+        return ret;
+    }
+
     if (SIZE_ONLY)
         return ret;
     /****** !SIZE_ONLY AFTER HERE *********/
index c36a7fc..0865a73 100644 (file)
@@ -227,6 +227,9 @@ KEEPS       KEEPS,      no        ; $& begins here.
 
 #*New charclass like patterns
 LNBREAK     LNBREAK,    none      ; generic newline pattern
+
+# regcomp.c expects the node number of the complement to be one greater than
+# the non-complement
 VERTWS      VERTWS,     none 0 S  ; vertical whitespace         (Perl 6)
 NVERTWS     NVERTWS,    none 0 S  ; not vertical whitespace     (Perl 6)
 HORIZWS     HORIZWS,    none 0 S  ; horizontal whitespace       (Perl 6)