This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
make the EXACTF_invlist only when SCF_DO_STCLASS
author Hugo van der Sanden <hv@crypt.org>
Thu, 11 Dec 2014 15:27:07 +0000 (15:27 +0000)
committer Hugo van der Sanden <hv@crypt.org>
Thu, 11 Dec 2014 15:27:07 +0000 (15:27 +0000)
The data is used only for STCLASS, and it's somewhat expensive to create.

embed.fnc
embed.h
proto.h
regcomp.c

diff --git a/embed.fnc b/embed.fnc
index 963a00f..16c33a2 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1528,6 +1528,8 @@ EiMn      |void   |invlist_iterinit|NN SV* invlist
 EsMRn  |bool   |invlist_iternext|NN SV* invlist|NN UV* start|NN UV* end
 EiMn   |void   |invlist_iterfinish|NN SV* invlist
 EiMRn  |UV     |invlist_highest|NN SV* const invlist
+EMRs   |SV*    |_make_exactf_invlist   |NN RExC_state_t *pRExC_state \
+                                       |NN regnode *node
 #endif
 #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_UTF8_C)
 EXmM   |void   |_invlist_intersection  |NN SV* const a|NN SV* const b|NN SV** i
diff --git a/embed.h b/embed.h
index 2bf125e..8e71a42 100644 (file)
--- a/embed.h
+++ b/embed.h
 #  if defined(PERL_IN_REGCOMP_C)
 #define _append_range_to_invlist(a,b,c)        S__append_range_to_invlist(aTHX_ a,b,c)
 #define _invlist_array_init    S__invlist_array_init
+#define _make_exactf_invlist(a,b)      S__make_exactf_invlist(aTHX_ a,b)
 #define add_above_Latin1_folds(a,b,c)  S_add_above_Latin1_folds(aTHX_ a,b,c)
 #define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b)
 #define add_data               S_add_data
diff --git a/proto.h b/proto.h
index 3345d1c..ce86fca 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6864,6 +6864,13 @@ PERL_STATIC_INLINE UV*   S__invlist_array_init(SV* const invlist, const bool will_
 #define PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT   \
        assert(invlist)
 
+STATIC SV*     S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1)
+                       __attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST  \
+       assert(pRExC_state); assert(node)
+
 STATIC void    S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_3);
diff --git a/regcomp.c b/regcomp.c
index a1784c7..e5d6a76 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -4513,9 +4513,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
         else if (PL_regkind[OP(scan)] == EXACT) {
             /* But OP != EXACT!, so is EXACTFish */
            SSize_t l = STR_LEN(scan);
-           UV uc = *((U8*)STRING(scan));
-            SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
-                                                     separate code points */
             const U8 * s = (U8*)STRING(scan);
 
            /* Search for fixed substrings supports EXACT only. */
@@ -4524,7 +4521,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 scan_commit(pRExC_state, data, minlenp, is_inf);
            }
            if (UTF) {
-               uc = utf8_to_uvchr_buf(s, s + l, NULL);
                l = utf8_length(s, s + l);
            }
            if (unfolded_multi_char) {
@@ -4544,156 +4540,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                }
            }
 
-            if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
-                ssc_clear_locale(data->start_class);
-            }
+            if (flags & SCF_DO_STCLASS) {
+                SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
 
-            if (! UTF) {
-
-                /* We punt and assume can match anything if the node begins
-                 * with a multi-character fold.  Things are complicated.  For
-                 * example, /ffi/i could match any of:
-                 *  "\N{LATIN SMALL LIGATURE FFI}"
-                 *  "\N{LATIN SMALL LIGATURE FF}I"
-                 *  "F\N{LATIN SMALL LIGATURE FI}"
-                 *  plus several other things; and making sure we have all the
-                 *  possibilities is hard. */
-                if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
-                    EXACTF_invlist =
-                             _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
-                }
-                else {
-
-                    /* Any Latin1 range character can potentially match any
-                     * other depending on the locale */
-                    if (OP(scan) == EXACTFL) {
-                        _invlist_union(EXACTF_invlist, PL_Latin1,
-                                                              &EXACTF_invlist);
-                    }
-                    else {
-                        /* But otherwise, it matches at least itself.  We can
-                         * quickly tell if it has a distinct fold, and if so,
-                         * it matches that as well */
-                        EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
-                        if (IS_IN_SOME_FOLD_L1(uc)) {
-                            EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
-                                                           PL_fold_latin1[uc]);
-                        }
-                    }
-
-                    /* Some characters match above-Latin1 ones under /i.  This
-                     * is true of EXACTFL ones when the locale is UTF-8 */
-                    if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
-                        && (! isASCII(uc) || (OP(scan) != EXACTFA
-                                            && OP(scan) != EXACTFA_NO_TRIE)))
-                    {
-                        add_above_Latin1_folds(pRExC_state,
-                                               (U8) uc,
-                                               &EXACTF_invlist);
-                    }
-                }
-            }
-            else {  /* Pattern is UTF-8 */
-                U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
-                STRLEN foldlen = UTF8SKIP(s);
-                const U8* e = s + STR_LEN(scan);
-                SV** listp;
-
-                /* The only code points that aren't folded in a UTF EXACTFish
-                 * node are are the problematic ones in EXACTFL nodes */
-                if (OP(scan) == EXACTFL
-                    && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
-                {
-                    /* We need to check for the possibility that this EXACTFL
-                     * node begins with a multi-char fold.  Therefore we fold
-                     * the first few characters of it so that we can make that
-                     * check */
-                    U8 *d = folded;
-                    int i;
-
-                    for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
-                        if (isASCII(*s)) {
-                            *(d++) = (U8) toFOLD(*s);
-                            s++;
-                        }
-                        else {
-                            STRLEN len;
-                            to_utf8_fold(s, d, &len);
-                            d += len;
-                            s += UTF8SKIP(s);
-                        }
-                    }
-
-                    /* And set up so the code below that looks in this folded
-                     * buffer instead of the node's string */
-                    e = d;
-                    foldlen = UTF8SKIP(folded);
-                    s = folded;
-                }
-
-                /* When we reach here 's' points to the fold of the first
-                 * character(s) of the node; and 'e' points to far enough along
-                 * the folded string to be just past any possible multi-char
-                 * fold. 'foldlen' is the length in bytes of the first
-                 * character in 's'
-                 *
-                 * Unlike the non-UTF-8 case, the macro for determining if a
-                 * string is a multi-char fold requires all the characters to
-                 * already be folded.  This is because of all the complications
-                 * if not.  Note that they are folded anyway, except in EXACTFL
-                 * nodes.  Like the non-UTF case above, we punt if the node
-                 * begins with a multi-char fold  */
-
-                if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
-                    EXACTF_invlist =
-                             _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
+                assert(EXACTF_invlist);
+                if (flags & SCF_DO_STCLASS_AND) {
+                    if (OP(scan) != EXACTFL)
+                        ssc_clear_locale(data->start_class);
+                    ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
+                    ANYOF_POSIXL_ZERO(data->start_class);
+                    ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
                 }
-                else {  /* Single char fold */
-
-                    /* It matches all the things that fold to it, which are
-                     * found in PL_utf8_foldclosures (including itself) */
-                    EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
-                    if (! PL_utf8_foldclosures) {
-                        _load_PL_utf8_foldclosures();
-                    }
-                    if ((listp = hv_fetch(PL_utf8_foldclosures,
-                                        (char *) s, foldlen, FALSE)))
-                    {
-                        AV* list = (AV*) *listp;
-                        IV k;
-                        for (k = 0; k <= av_tindex(list); k++) {
-                            SV** c_p = av_fetch(list, k, FALSE);
-                            UV c;
-                            assert(c_p);
-
-                            c = SvUV(*c_p);
-
-                            /* /aa doesn't allow folds between ASCII and non- */
-                            if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
-                                && isASCII(c) != isASCII(uc))
-                            {
-                                continue;
-                            }
+                else {  /* SCF_DO_STCLASS_OR */
+                    ssc_union(data->start_class, EXACTF_invlist, FALSE);
+                    ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
 
-                            EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
-                        }
-                    }
+                    /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
+                    ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
                 }
+                flags &= ~SCF_DO_STCLASS;
+                SvREFCNT_dec(EXACTF_invlist);
             }
-           if (flags & SCF_DO_STCLASS_AND) {
-                ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
-                ANYOF_POSIXL_ZERO(data->start_class);
-                ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
-           }
-           else if (flags & SCF_DO_STCLASS_OR) {
-                ssc_union(data->start_class, EXACTF_invlist, FALSE);
-               ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
-
-                /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
-                ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
-           }
-           flags &= ~SCF_DO_STCLASS;
-            SvREFCNT_dec(EXACTF_invlist);
        }
        else if (REGNODE_VARIES(OP(scan))) {
            SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
@@ -9501,6 +9368,152 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
 }
 #endif
 
+/*
+ * As best we can, determine the characters that can match the start of
+ * the given EXACTF-ish node.
+ *
+ * Returns the invlist as a new SV*; it is the caller's responsibility to
+ * call SvREFCNT_dec() when done with it.
+ */
+STATIC SV*
+S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
+{
+    const U8 * s = (U8*)STRING(node);
+    SSize_t bytelen = STR_LEN(node);
+    UV uc;
+    /* Start out big enough for 2 separate code points */
+    SV* invlist = _new_invlist(4);
+
+    PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
+
+    if (! UTF) {
+        uc = *s;
+
+        /* We punt and assume can match anything if the node begins
+         * with a multi-character fold.  Things are complicated.  For
+         * example, /ffi/i could match any of:
+         *  "\N{LATIN SMALL LIGATURE FFI}"
+         *  "\N{LATIN SMALL LIGATURE FF}I"
+         *  "F\N{LATIN SMALL LIGATURE FI}"
+         *  plus several other things; and making sure we have all the
+         *  possibilities is hard. */
+        if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
+            invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
+        }
+        else {
+            /* Any Latin1 range character can potentially match any
+             * other depending on the locale */
+            if (OP(node) == EXACTFL) {
+                _invlist_union(invlist, PL_Latin1, &invlist);
+            }
+            else {
+                /* But otherwise, it matches at least itself.  We can
+                 * quickly tell if it has a distinct fold, and if so,
+                 * it matches that as well */
+                invlist = add_cp_to_invlist(invlist, uc);
+                if (IS_IN_SOME_FOLD_L1(uc))
+                    invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
+            }
+
+            /* Some characters match above-Latin1 ones under /i.  This
+             * is true of EXACTFL ones when the locale is UTF-8 */
+            if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
+                && (! isASCII(uc) || (OP(node) != EXACTFA
+                                    && OP(node) != EXACTFA_NO_TRIE)))
+            {
+                add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
+            }
+        }
+    }
+    else {  /* Pattern is UTF-8 */
+        U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
+        STRLEN foldlen = UTF8SKIP(s);
+        const U8* e = s + bytelen;
+        SV** listp;
+
+        uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
+
+        /* The only code points that aren't folded in a UTF EXACTFish
+         * node are the problematic ones in EXACTFL nodes */
+        if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
+            /* We need to check for the possibility that this EXACTFL
+             * node begins with a multi-char fold.  Therefore we fold
+             * the first few characters of it so that we can make that
+             * check */
+            U8 *d = folded;
+            int i;
+
+            for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
+                if (isASCII(*s)) {
+                    *(d++) = (U8) toFOLD(*s);
+                    s++;
+                }
+                else {
+                    STRLEN len;
+                    to_utf8_fold(s, d, &len);
+                    d += len;
+                    s += UTF8SKIP(s);
+                }
+            }
+
+            /* And set up so the code below that looks in this folded
+             * buffer instead of the node's string */
+            e = d;
+            foldlen = UTF8SKIP(folded);
+            s = folded;
+        }
+
+        /* When we reach here 's' points to the fold of the first
+         * character(s) of the node; and 'e' points to far enough along
+         * the folded string to be just past any possible multi-char
+         * fold. 'foldlen' is the length in bytes of the first
+         * character in 's'
+         *
+         * Unlike the non-UTF-8 case, the macro for determining if a
+         * string is a multi-char fold requires all the characters to
+         * already be folded.  This is because of all the complications
+         * if not.  Note that they are folded anyway, except in EXACTFL
+         * nodes.  Like the non-UTF case above, we punt if the node
+         * begins with a multi-char fold  */
+
+        if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
+            invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
+        }
+        else {  /* Single char fold */
+
+            /* It matches all the things that fold to it, which are
+             * found in PL_utf8_foldclosures (including itself) */
+            invlist = add_cp_to_invlist(invlist, uc);
+            if (! PL_utf8_foldclosures)
+                _load_PL_utf8_foldclosures();
+            if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                (char *) s, foldlen, FALSE)))
+            {
+                AV* list = (AV*) *listp;
+                IV k;
+                for (k = 0; k <= av_tindex(list); k++) {
+                    SV** c_p = av_fetch(list, k, FALSE);
+                    UV c;
+                    assert(c_p);
+
+                    c = SvUV(*c_p);
+
+                    /* /aa doesn't allow folds between ASCII and non- */
+                    if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
+                        && isASCII(c) != isASCII(uc))
+                    {
+                        continue;
+                    }
+
+                    invlist = add_cp_to_invlist(invlist, c);
+                }
+            }
+        }
+    }
+
+    return invlist;
+}
+
 #undef HEADER_LENGTH
 #undef TO_INTERNAL_SIZE
 #undef FROM_INTERNAL_SIZE