rework how the trie logic handles the newer EXACT nodetypes
authorYves Orton <demerphq@gmail.com>
Sun, 19 Feb 2012 20:32:05 +0000 (21:32 +0100)
committerYves Orton <demerphq@gmail.com>
Sat, 3 Mar 2012 12:27:28 +0000 (13:27 +0100)
This cleans up and simplifies and extends how the trie
logic interacts with the new node types. This change ultimately
makes the EXACTFU, EXACTFU_SS, EXACTFU_NO_TRIE (renamed to
EXACTFU_TRICKYFOLD) work properly with the trie engine regardless
of whether the string is utf8 or latin1.

This patch depends on the following:

    EXACT              => utf8 or "binary" text

    EXACTFU            => either pre-folded utf8, or latin1 that has to be folded as though it was utf8
    EXACTFU_SS         => special case of EXACTFU to handle \xDF/ss (affects latin1 treatment)
    EXACTFU_TRICKYFOLD => special case of EXACTFU to handle tricky non-latin1 fold rules

    EXACTF             => "old style fold logic" untriable nodetype
    EXACTFA            => (currently) untriable nodetype
    EXACTFL            => (currently) untriable nodetype

See the comments in regcomp.sym for these fold types.

This patch involves a number of distinct, but related parts. Starting
from compilation:

* Simplify how we detect a triable sequence given the new nodetypes,
  this also probably fixed some "bugs" in how we detected certain
  sequences, like /||foo|bar/.

* Simplify how we read EXACTFU nodes under utf8 by removing the now
  redundant folding logic (EXACTFU nodes under utf8 are prefolded).
  Also extend this logic to handle latin1 patterns properly (in
  conjunction with  other changes)

* Part of the problems associated with EXACTFU_SS and EXACTFU_TRICKYFOLD
  have to do with how the trie logic interacts with the minlen logic.
  This change handles both by pessimising the minlen when encounting
  these nodetypes. One observation is that the minlen logic is basically
  broken, and works only because it conflates bytes and codepoints in
  such a way that we more or less always get a value small enough that things work out
  anyway. Fixing that is properly is the job of another patch.

* Part of the problem of doing folding under unicode rules is that
  there are a lot of foldings possible, some with strange rules. This
  means that the bitmap logic does not work correctly in all cases,
  as we currently do not have any way to populate it properly.
  So this patch disables the bitmap entirely when folding is involved
  until that is fixed.

The end result of this is: we can TRIE/AHOCORASICK any sequence of
EXACT, or EXACTFU (ish) nodes, regardless of utf8 or not, but we disable
the bitmap when folding.

A note for follow up relating to this patch is that the way EXACTFU_XXX
nodes are currently dealt with we wont build the "maximal" trie because
of their presence, instead creating a "jumptrie" consisting of either a
leading EXACTFU node followed by a EXACTFU_XXX node, or vice versa. We
should eventually address that.

embed.fnc
embed.h
proto.h
regcomp.c
regcomp.sym
regexec.c
regnodes.h
t/re/fold_grind.t

index 5c380ff..c549dc9 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -604,7 +604,9 @@ Ap  |UV     |to_uni_upper   |UV c|NN U8 *p|NN STRLEN *lenp
 Ap     |UV     |to_uni_title   |UV c|NN U8 *p|NN STRLEN *lenp
 #ifdef PERL_IN_UTF8_C
 sR     |U8     |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp
-p      |UV     |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
+#endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
+EXp        |UV        |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
 #endif
 #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
 p      |UV     |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
diff --git a/embed.h b/embed.h
index 9fdf91b..1d1e598 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define reghop4                        S_reghop4
 #    endif
 #  endif
+#  if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
+#define _to_fold_latin1(a,b,c,d)       Perl__to_fold_latin1(aTHX_ a,b,c,d)
+#  endif
 #  if defined(PERL_OLD_COPY_ON_WRITE)
 #define sv_setsv_cow(a,b)      Perl_sv_setsv_cow(aTHX_ a,b)
 #  endif
 #define isa_lookup(a,b,c,d)    S_isa_lookup(aTHX_ a,b,c,d)
 #  endif
 #  if defined(PERL_IN_UTF8_C)
-#define _to_fold_latin1(a,b,c,d)       Perl__to_fold_latin1(aTHX_ a,b,c,d)
 #define check_locale_boundary_crossing(a,b,c,d)        S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
 #define is_utf8_char_slow      S_is_utf8_char_slow
 #define is_utf8_common(a,b,c)  S_is_utf8_common(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index dd3fd58..b811e6b 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -7114,12 +7114,6 @@ STATIC bool      S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
 
 #endif
 #if defined(PERL_IN_UTF8_C)
-PERL_CALLCONV UV       Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags)
-                       __attribute__nonnull__(pTHX_2)
-                       __attribute__nonnull__(pTHX_3);
-#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1       \
-       assert(p); assert(lenp)
-
 STATIC UV      S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1)
@@ -7165,6 +7159,14 @@ PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *le
 #define PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1        \
        assert(p); assert(lenp)
 
+#endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
+PERL_CALLCONV UV       Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags)
+                       __attribute__nonnull__(pTHX_2)
+                       __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1       \
+       assert(p); assert(lenp)
+
 #endif
 #if defined(PERL_IN_UTIL_C)
 STATIC bool    S_ckwarn_common(pTHX_ U32 w);
index 7a3433a..e3da6e9 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1366,44 +1366,47 @@ is the recommended Unicode-aware way of saying
     *(d++) = uv;
 */
 
-#define TRIE_STORE_REVCHAR                                                 \
+#define TRIE_STORE_REVCHAR(val)                                            \
     STMT_START {                                                           \
        if (UTF) {                                                         \
-           SV *zlopp = newSV(2);                                          \
+            SV *zlopp = newSV(7); /* XXX: optimize me */                   \
            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
-           unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
+            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
            SvPOK_on(zlopp);                                               \
            SvUTF8_on(zlopp);                                              \
            av_push(revcharmap, zlopp);                                    \
        } else {                                                           \
-           char ooooff = (char)uvc;                                               \
+            char ooooff = (char)val;                                           \
            av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
        }                                                                  \
         } STMT_END
 
-#define TRIE_READ_CHAR STMT_START {                                           \
-    wordlen++;                                                                \
-    if ( UTF ) {                                                              \
-       if ( folder ) {                                                       \
-           if ( foldlen > 0 ) {                                              \
-              uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
-              foldlen -= len;                                                \
-              scan += len;                                                   \
-              len = 0;                                                       \
-           } else {                                                          \
-               len = UTF8SKIP(uc);\
-               uvc = to_utf8_fold( uc, foldbuf, &foldlen);                   \
-               foldlen -= UNISKIP( uvc );                                    \
-               scan = foldbuf + UNISKIP( uvc );                              \
-           }                                                                 \
-       } else {                                                              \
-           uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
-       }                                                                     \
-    } else {                                                                  \
-       uvc = (U32)*uc;                                                       \
-       len = 1;                                                              \
-    }                                                                         \
+#define TRIE_READ_CHAR STMT_START {                                                     \
+    wordlen++;                                                                          \
+    if ( UTF ) {                                                                        \
+        /* if it is UTF then it is either already folded, or does not need folding */   \
+        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
+    }                                                                                   \
+    else if (folder == PL_fold_latin1) {                                                \
+        /* if we use this folder we have to obey unicode rules on latin-1 data */       \
+        if ( foldlen > 0 ) {                                                            \
+           uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
+           foldlen -= len;                                                              \
+           scan += len;                                                                 \
+           len = 0;                                                                     \
+        } else {                                                                        \
+            len = 1;                                                                    \
+            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
+            skiplen = UNISKIP(uvc);                                                     \
+            foldlen -= skiplen;                                                         \
+            scan = foldbuf + skiplen;                                                   \
+        }                                                                               \
+    } else {                                                                            \
+        /* raw data, will be folded later if needed */                                  \
+        uvc = (U32)*uc;                                                                 \
+        len = 1;                                                                        \
+    }                                                                                   \
 } STMT_END
 
 
@@ -1522,10 +1525,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     switch (flags) {
        case EXACT: break;
        case EXACTFA:
+        case EXACTFU_SS:
+        case EXACTFU_TRICKYFOLD:
        case EXACTFU: folder = PL_fold_latin1; break;
        case EXACTF:  folder = PL_fold; break;
        case EXACTFL: folder = PL_fold_locale; break;
-        default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
+        default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
     }
 
     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
@@ -1534,7 +1539,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     trie->wordcount = word_count;
     RExC_rxi->data->data[ data_slot ] = (void*)trie;
     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
-    if (!(UTF && folder))
+    if (flags == EXACT)
        trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -1594,6 +1599,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
         const U8 * const e  = uc + STR_LEN( noper );
         STRLEN foldlen = 0;
         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+        STRLEN skiplen = 0;
         const U8 *scan = (U8*)NULL;
         U32 wordlen      = 0;         /* required init */
         STRLEN chars = 0;
@@ -1603,28 +1609,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
             trie->minlen= 0;
             continue;
         }
-        if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+        if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
                                           regardless of encoding */
-
+            if (OP( noper ) == EXACTFU_SS) {
+                /* false positives are ok, so just set this */
+                TRIE_BITMAP_SET(trie,0xDF);
+            }
+        }
         for ( ; uc < e ; uc += len ) {
             TRIE_CHARCOUNT(trie)++;
             TRIE_READ_CHAR;
             chars++;
             if ( uvc < 256 ) {
+                if ( folder ) {
+                    U8 folded= folder[ (U8) uvc ];
+                    if ( !trie->charmap[ folded ] ) {
+                        trie->charmap[ folded ]=( ++trie->uniquecharcount );
+                        TRIE_STORE_REVCHAR( folded );
+                    }
+                }
                 if ( !trie->charmap[ uvc ] ) {
                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
-                    if ( folder )
-                        trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
-                    TRIE_STORE_REVCHAR;
+                    TRIE_STORE_REVCHAR( uvc );
                 }
                 if ( set_bit ) {
                    /* store the codepoint in the bitmap, and its folded
                     * equivalent. */
-                    TRIE_BITMAP_SET(trie,uvc);
+                    TRIE_BITMAP_SET(trie, uvc);
 
                    /* store the folded codepoint */
-                   if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+                    if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
 
                    if ( !UTF ) {
                        /* store first byte of utf8 representation of
@@ -1647,17 +1662,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 
                 if ( !SvTRUE( *svpp ) ) {
                     sv_setiv( *svpp, ++trie->uniquecharcount );
-                    TRIE_STORE_REVCHAR;
+                    TRIE_STORE_REVCHAR(uvc);
                 }
             }
         }
         if( cur == first ) {
-            trie->minlen=chars;
-            trie->maxlen=chars;
+            trie->minlen = chars;
+            trie->maxlen = chars;
         } else if (chars < trie->minlen) {
-            trie->minlen=chars;
+            trie->minlen = chars;
         } else if (chars > trie->maxlen) {
-            trie->maxlen=chars;
+            trie->maxlen = chars;
+        }
+        if (OP( noper ) == EXACTFU_SS) {
+            /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
+           if (trie->minlen > 1)
+                trie->minlen= 1;
+        }
+       if (OP( noper ) == EXACTFU_TRICKYFOLD) {
+           /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" 
+            *                - We assume that any such sequence might match a 2 byte string */
+            if (trie->minlen > 2 )
+                trie->minlen= 2;
         }
 
     } /* end first pass */
@@ -1730,6 +1756,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
            STRLEN foldlen   = 0;         /* required init */
             U32 wordlen      = 0;         /* required init */
            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+            STRLEN skiplen   = 0;
 
             if (OP(noper) != NOTHING) {
                 for ( ; uc < e ; uc += len ) {
@@ -1930,8 +1957,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 
             STRLEN foldlen   = 0;         /* required init */
             U32 wordlen      = 0;         /* required init */
+            STRLEN skiplen   = 0;
             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
 
+
             if ( OP(noper) != NOTHING ) {
                 for ( ; uc < e ; uc += len ) {
 
@@ -2568,7 +2597,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
  *      optimizer doesn't reject these possibilities based on size constraints.
  * 2)   These sequences are not currently correctly handled by the trie code
  *      either, so it changes the joined node type to ops that are not handled
- *      by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
+ *      by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
  * 3)   This is sufficient for the two Greek sequences (described below), but
  *      the one involving the Sharp s (\xDF) needs more.  The node type
  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
@@ -2823,7 +2852,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                         * is.  (I (khw) think it doesn't matter in regexec.c
                         * for UTF patterns, but no need to change it */
                        if (OP(scan) == EXACTFU) {
-                           OP(scan) = EXACTFU_NO_TRIE;
+                           OP(scan) = EXACTFU_TRICKYFOLD;
                        }
                        s += 6; /* We already know what this sequence is.  Skip
                                   the rest of it */
@@ -3185,7 +3214,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         regnode *first = (regnode *)NULL;
                         regnode *last = (regnode *)NULL;
                         regnode *tail = scan;
-                        U8 optype = 0;
+                        U8 trietype = 0;
                         U32 count=0;
 
 #ifdef DEBUGGING
@@ -3216,32 +3245,59 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         
                         /*
 
-                           step through the branches, cur represents each
-                           branch, noper is the first thing to be matched
-                           as part of that branch and noper_next is the
-                           regnext() of that node. if noper is an EXACT
-                           and noper_next is the same as scan (our current
-                           position in the regex) then the EXACT branch is
-                           a possible optimization target. Once we have
-                           two or more consecutive such branches we can
-                           create a trie of the EXACT's contents and stich
-                           it in place. If the sequence represents all of
-                           the branches we eliminate the whole thing and
-                           replace it with a single TRIE. If it is a
-                           subsequence then we need to stitch it in. This
-                           means the first branch has to remain, and needs
-                           to be repointed at the item on the branch chain
-                           following the last branch optimized. This could
-                           be either a BRANCH, in which case the
-                           subsequence is internal, or it could be the
-                           item following the branch sequence in which
-                           case the subsequence is at the end.
+                            Step through the branches
+                                cur represents each branch,
+                                noper is the first thing to be matched as part of that branch
+                                noper_next is the regnext() of that node.
+
+                            We normally handle a case like this /FOO[xyz]|BAR[pqr]/
+                            via a "jump trie" but we also support building with NOJUMPTRIE,
+                            which restricts the trie logic to structures like /FOO|BAR/.
+
+                            If noper is a trieable nodetype then the branch is a possible optimization
+                            target. If we are building under NOJUMPTRIE then we require that noper_next
+                            is the same as scan (our current position in the regex program).
+
+                            Once we have two or more consecutive such branches we can create a
+                            trie of the EXACT's contents and stitch it in place into the program.
+
+                            If the sequence represents all of the branches in the alternation we
+                            replace the entire thing with a single TRIE node.
+
+                            Otherwise when it is a subsequence we need to stitch it in place and
+                            replace only the relevant branches. This means the first branch has
+                            to remain as it is used by the alternation logic, and its next pointer,
+                            and needs to be repointed at the item on the branch chain following
+                            the last branch we have optimized away.
+
+                            This could be either a BRANCH, in which case the subsequence is internal,
+                            or it could be the item following the branch sequence in which case the
+                            subsequence is at the end (which does not necessarily mean the first node
+                            is the start of the alternation).
+
+                            TRIE_TYPE(X) is a define which maps the optype to a trietype.
+
+                                optype          |  trietype
+                                ----------------+-----------
+                                NOTHING         | NOTHING
+                                EXACT           | EXACT
+                                EXACTFU         | EXACTFU
+                                EXACTFU_SS      | EXACTFU
+                                EXACTFU_TRICKYFOLD | EXACTFU
+                                EXACTFA         | 0
+
 
                         */
+#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
+                       ( EXACT == (X) )   ? EXACT :        \
+                       ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
+                       0 )
 
                         /* dont use tail as the end marker for this traverse */
                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
                             regnode * const noper = NEXTOPER( cur );
+                            U8 noper_type = OP( noper );
+                            U8 noper_trietype = TRIE_TYPE( noper_type );
 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
                             regnode * const noper_next = regnext( noper );
 #endif
@@ -3263,59 +3319,68 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
                             });
-                            if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
-                                         : PL_regkind[ OP( noper ) ] == EXACT )
-                                  || OP(noper) == NOTHING )
+
+                            /* Is noper a trieable nodetype that can be merged with the
+                             * current trie (if there is one)? */
+                            if ( noper_trietype
+                                  &&
+                                  (
+                                        ( noper_trietype == NOTHING )
+                                        ||
+                                        ( trietype == NOTHING )
+                                        ||
+                                        ( trietype == noper_trietype )
+                                  )
 #ifdef NOJUMPTRIE
                                   && noper_next == tail
 #endif
                                   && count < U16_MAX)
                             {
+                                /* Handle mergable triable node
+                                 * Either we are the first node in a new trieable sequence,
+                                 * in which case we do some bookkeeping, otherwise we update
+                                 * the end pointer. */
                                 count++;
-                                if ( !first || optype == NOTHING ) {
-                                    if (!first) first = cur;
-                                    optype = OP( noper );
+                                if ( !first ) {
+                                    first = cur;
+                                    trietype = noper_trietype;
                                 } else {
+                                    if ( trietype == NOTHING )
+                                        trietype = noper_trietype;
                                     last = cur;
                                 }
-                            } else {
-/* 
-    Currently the trie logic handles case insensitive matching properly only
-    when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
-    semantics).
-
-    If/when this is fixed the following define can be swapped
-    in below to fully enable trie logic.
-
-#define TRIE_TYPE_IS_SAFE 1
-
-Note that join_exact() assumes that the other types of EXACTFish nodes are not
-used in tries, so that would have to be updated if this changed
-
-*/
-#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
-
-                                if ( last && TRIE_TYPE_IS_SAFE ) {
+                            } /* end handle mergable triable node */
+                            else {
+                                /* handle unmergable node -
+                                 * noper may either be a triable node which can not be tried
+                                 * together with the current trie, or a non triable node */
+                                if ( last && trietype != NOTHING ) {
+                                    /* if last is set then we have found at least two triable branch
+                                     * sequences in a row of a similar trietype so we can turn them
+                                     * into a trie */
                                     make_trie( pRExC_state, 
                                             startbranch, first, cur, tail, count, 
-                                            optype, depth+1 );
+                                            trietype, depth+1 );
+                                    last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
                                 }
-                                if ( PL_regkind[ OP( noper ) ] == EXACT
+                                if ( noper_trietype
 #ifdef NOJUMPTRIE
                                      && noper_next == tail
 #endif
                                 ){
+                                    /* noper is triable, so we can start a new trie sequence */
                                     count = 1;
                                     first = cur;
-                                    optype = OP( noper );
-                                } else {
+                                    trietype = noper_trietype;
+                                } else if (first) {
+                                    /* if we already saw a first but the current node is not triable then we have
+                                     * to reset the first information. */
                                     count = 0;
                                     first = NULL;
-                                    optype = 0;
+                                    trietype = 0;
                                 }
-                                last = NULL;
-                            }
-                        }
+                            } /* end handle unmergable node */
+                        } /* loop over branches */
                         DEBUG_OPTIMISE_r({
                             regprop(RExC_rx, mysv, cur);
                             PerlIO_printf( Perl_debug_log,
@@ -3323,9 +3388,11 @@ used in tries, so that would have to be updated if this changed
                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
 
                         });
-                        
-                        if ( last && TRIE_TYPE_IS_SAFE ) {
-                            made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
+                        if ( last && trietype != NOTHING ) {
+                            /* the last branch of the sequence was part of a trie,
+                             * so we have to construct it here outside of the loop
+                             */
+                            made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
 #ifdef TRIE_STUDY_OPT
                             if ( ((made == MADE_EXACT_TRIE && 
                                  startbranch == first) 
@@ -3339,8 +3406,8 @@ used in tries, so that would have to be updated if this changed
                                 }
                             }
 #endif
-                        }
-                    }
+                        } /* end if ( last) */
+                    } /* TRIE_MAXBUF is non zero */
                     
                 } /* do trie */
                 
@@ -12034,7 +12101,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
                 case EXACTFA:
                 case EXACTFU:
                 case EXACTFU_SS:
-                case EXACTFU_NO_TRIE:
+                case EXACTFU_TRICKYFOLD:
                 case EXACTFL:
                         if( exact == PSEUDO )
                             exact= OP(scan);
index f0ee701..e6be6ac 100644 (file)
@@ -92,14 +92,14 @@ BRANCH      BRANCH,     node 0 V  ; Match this alternative, or the next...
 # not used
 BACK        BACK,       no 0 V    ; Match "", "next" ptr points backward.
 
-#*Literals
+#*Literals - NOTE the relative ordering of these types is important do not change it
 
 EXACT       EXACT,      str       ; Match this string (preceded by length).
 EXACTF      EXACT,      str       ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len).
 EXACTFL     EXACT,      str       ; Match this string (not guaranteed to be folded) using /il rules (w/len).
 EXACTFU     EXACT,      str      ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len).
 EXACTFU_SS  EXACT,      str      ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len).
-EXACTFU_NO_TRIE EXACT,  str      ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it
+EXACTFU_TRICKYFOLD EXACT,  str   ; Match this folded UTF-8 string using /iu rules, but don't generate a trie for it
 EXACTFA     EXACT,      str      ; Match this string (not guaranteed to be folded) using /iaa rules (w/len).
 
 #*Do nothing types
index 76eb7f9..8ccb6f7 100644 (file)
--- a/regexec.c
+++ b/regexec.c
 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
    we don't need this definition. */
 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
-#define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
+#define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 
 #else
 /* ... so we use this as its faster. */
 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
-#define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn) == EXACTFA)
+#define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 
@@ -1187,58 +1187,61 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 
 #define DECL_TRIE_TYPE(scan) \
     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
-                   trie_type = (scan->flags != EXACT) \
-                             ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
-                              : (utf8_target ? trie_utf8 : trie_plain)
-
-#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
-uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
-    switch (trie_type) {                                                    \
-    case trie_utf8_fold:                                                    \
-       if ( foldlen>0 ) {                                                  \
-           uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
-           foldlen -= len;                                                 \
-           uscan += len;                                                   \
-           len=0;                                                          \
-       } else {                                                            \
-           uvc = to_utf8_fold( (U8 *) uc, foldbuf, &foldlen );             \
-           len = UTF8SKIP(uc); \
-           foldlen -= UNISKIP( uvc );                                      \
-           uscan = foldbuf + UNISKIP( uvc );                               \
-       }                                                                   \
-       break;                                                              \
-    case trie_latin_utf8_fold:                                              \
-       if ( foldlen>0 ) {                                                  \
-           uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
-           foldlen -= len;                                                 \
-           uscan += len;                                                   \
-           len=0;                                                          \
-       } else {                                                            \
-           len = 1;                                                        \
-           uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
-           foldlen -= UNISKIP( uvc );                                      \
-           uscan = foldbuf + UNISKIP( uvc );                               \
-       }                                                                   \
-       break;                                                              \
-    case trie_utf8:                                                         \
-       uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
-       break;                                                              \
-    case trie_plain:                                                        \
-       uvc = (UV)*uc;                                                      \
-       len = 1;                                                            \
-    }                                                                       \
-    if (uvc < 256) {                                                        \
-       charid = trie->charmap[ uvc ];                                      \
-    }                                                                       \
-    else {                                                                  \
-       charid = 0;                                                         \
-       if (widecharmap) {                                                  \
-           SV** const svpp = hv_fetch(widecharmap,                         \
-                       (char*)&uvc, sizeof(UV), 0);                        \
-           if (svpp)                                                       \
-               charid = (U16)SvIV(*svpp);                                  \
-       }                                                                   \
-    }                                                                       \
+                    trie_type = ((scan->flags == EXACT) \
+                              ? (utf8_target ? trie_utf8 : trie_plain) \
+                              : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
+
+#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
+uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
+    STRLEN skiplen;                                                                 \
+    switch (trie_type) {                                                            \
+    case trie_utf8_fold:                                                            \
+        if ( foldlen>0 ) {                                                          \
+            uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+            foldlen -= len;                                                         \
+            uscan += len;                                                           \
+            len=0;                                                                  \
+        } else {                                                                    \
+            uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
+            len = UTF8SKIP(uc);                                                     \
+            skiplen = UNISKIP( uvc );                                               \
+            foldlen -= skiplen;                                                     \
+            uscan = foldbuf + skiplen;                                              \
+        }                                                                           \
+        break;                                                                      \
+    case trie_latin_utf8_fold:                                                      \
+        if ( foldlen>0 ) {                                                          \
+            uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+            foldlen -= len;                                                         \
+            uscan += len;                                                           \
+            len=0;                                                                  \
+        } else {                                                                    \
+            len = 1;                                                                \
+            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
+            skiplen = UNISKIP( uvc );                                               \
+            foldlen -= skiplen;                                                     \
+            uscan = foldbuf + skiplen;                                              \
+        }                                                                           \
+        break;                                                                      \
+    case trie_utf8:                                                                 \
+        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
+        break;                                                                      \
+    case trie_plain:                                                                \
+        uvc = (UV)*uc;                                                              \
+        len = 1;                                                                    \
+    }                                                                               \
+    if (uvc < 256) {                                                                \
+        charid = trie->charmap[ uvc ];                                              \
+    }                                                                               \
+    else {                                                                          \
+        charid = 0;                                                                 \
+        if (widecharmap) {                                                          \
+            SV** const svpp = hv_fetch(widecharmap,                                 \
+                        (char*)&uvc, sizeof(UV), 0);                                \
+            if (svpp)                                                               \
+                charid = (U16)SvIV(*svpp);                                          \
+        }                                                                           \
+    }                                                                               \
 } STMT_END
 
 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
@@ -1490,7 +1493,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            }
            goto do_exactf_utf8;
 
-       case EXACTFU_NO_TRIE:
+       case EXACTFU_TRICKYFOLD:
        case EXACTFU:
            if (UTF_PATTERN || utf8_target) {
                utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -3267,16 +3270,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             /* In this case the charclass data is available inline so
                we can fail fast without a lot of extra overhead. 
              */
-            if (scan->flags == EXACT || !utf8_target) {
-                if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
-                    DEBUG_EXECUTE_r(
-                        PerlIO_printf(Perl_debug_log,
-                                 "%*s  %sfailed to match trie start class...%s\n",
-                                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
-                    );
-                    sayNO_SILENT;
-                    /* NOTREACHED */
-                }                      
+            if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
+                DEBUG_EXECUTE_r(
+                    PerlIO_printf(Perl_debug_log,
+                              "%*s  %sfailed to match trie start class...%s\n",
+                              REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
+                );
+                sayNO_SILENT;
+                /* NOTREACHED */
             }
             /* FALL THROUGH */
        case TRIE:
@@ -3334,9 +3335,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
                 U32 state = trie->startstate;
 
-               if (trie->bitmap && trie_type != trie_utf8_fold &&
-                   !TRIE_BITMAP_TEST(trie,*locinput)
-               ) {
+                if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
                    if (trie->states[ state ].wordnum) {
                         DEBUG_EXECUTE_r(
                             PerlIO_printf(Perl_debug_log,
@@ -3671,7 +3670,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            goto do_exactf;
 
        case EXACTFU_SS:
-       case EXACTFU_NO_TRIE:
+       case EXACTFU_TRICKYFOLD:
        case EXACTFU:
            folder = foldEQ_latin1;
            fold_array = PL_fold_latin1;
@@ -5085,7 +5084,7 @@ NULL
                            case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
                            case EXACTFA:
                            case EXACTFU_SS:
-                           case EXACTFU_NO_TRIE:
+                           case EXACTFU_TRICKYFOLD:
                            case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
                            case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
                            default: ST.c2 = ST.c1;
@@ -5241,7 +5240,7 @@ NULL
                            case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
                            case EXACTFA:
                            case EXACTFU_SS:
-                           case EXACTFU_NO_TRIE:
+                           case EXACTFU_TRICKYFOLD:
                            case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
                            case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
                            default: ST.c2 = ST.c1; break;
@@ -6035,7 +6034,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            goto do_exactf;
 
     case EXACTFU_SS:
-    case EXACTFU_NO_TRIE:
+    case EXACTFU_TRICKYFOLD:
     case EXACTFU:
        utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
 
@@ -6077,7 +6076,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            switch (OP(p)) {
                case EXACTF: folded = PL_fold[c]; break;
                case EXACTFA:
-               case EXACTFU_NO_TRIE:
+               case EXACTFU_TRICKYFOLD:
                case EXACTFU: folded = PL_fold_latin1[c]; break;
                case EXACTFL: folded = PL_fold_locale[c]; break;
                default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
index 9941593..5561243 100644 (file)
@@ -62,7 +62,7 @@
 #define        EXACTFL                 50      /* 0x32 Match this string (not guaranteed to be folded) using /il rules (w/len). */
 #define        EXACTFU                 51      /* 0x33 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
 #define        EXACTFU_SS              52      /* 0x34 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
-#define        EXACTFU_NO_TRIE         53      /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */
+#define        EXACTFU_TRICKYFOLD      53      /* 0x35 Match this folded UTF-8 string using /iu rules, but don't generate a trie for it */
 #define        EXACTFA                 54      /* 0x36 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
 #define        NOTHING                 55      /* 0x37 Match empty string. */
 #define        TAIL                    56      /* 0x38 Match empty string. Can jump here from outside. */
@@ -223,7 +223,7 @@ EXTCONST U8 PL_regkind[] = {
        EXACT,          /* EXACTFL                */
        EXACT,          /* EXACTFU                */
        EXACT,          /* EXACTFU_SS             */
-       EXACT,          /* EXACTFU_NO_TRIE        */
+       EXACT,          /* EXACTFU_TRICKYFOLD     */
        EXACT,          /* EXACTFA                */
        NOTHING,        /* NOTHING                */
        NOTHING,        /* TAIL                   */
@@ -384,7 +384,7 @@ static const U8 regarglen[] = {
        0,                                      /* EXACTFL      */
        0,                                      /* EXACTFU      */
        0,                                      /* EXACTFU_SS   */
-       0,                                      /* EXACTFU_NO_TRIE */
+       0,                                      /* EXACTFU_TRICKYFOLD */
        0,                                      /* EXACTFA      */
        0,                                      /* NOTHING      */
        0,                                      /* TAIL         */
@@ -502,7 +502,7 @@ static const char reg_off_by_arg[] = {
        0,      /* EXACTFL      */
        0,      /* EXACTFU      */
        0,      /* EXACTFU_SS   */
-       0,      /* EXACTFU_NO_TRIE */
+       0,      /* EXACTFU_TRICKYFOLD */
        0,      /* EXACTFA      */
        0,      /* NOTHING      */
        0,      /* TAIL         */
@@ -625,7 +625,7 @@ EXTCONST char * const PL_reg_name[] = {
        "EXACTFL",                      /* 0x32 */
        "EXACTFU",                      /* 0x33 */
        "EXACTFU_SS",                   /* 0x34 */
-       "EXACTFU_NO_TRIE",              /* 0x35 */
+       "EXACTFU_TRICKYFOLD",           /* 0x35 */
        "EXACTFA",                      /* 0x36 */
        "NOTHING",                      /* 0x37 */
        "TAIL",                         /* 0x38 */
index 98fc396..e2153e3 100644 (file)
@@ -735,7 +735,7 @@ foreach my $test (sort { numerically } keys %tests) {
                           if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) {
                             # Failed or debug; output the result
                             $count++;
-                            ok($res, $desc);
+                            ok($res, "test $count - $desc");
                           } else {
                             # Just count the test as passed
                             $okays++;