+
+ make_trie(startbranch,first,last,tail,flags)
+ startbranch: the first branch in the whole branch sequence
+ first : start branch of sequence of branch-exact nodes.
+ May be the same as startbranch
+ last : Thing following the last branch.
+ May be the same as tail.
+ tail : item following the branch sequence
+ flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
+
+Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
+
+A trie is an N'ary tree where the branches are determined by digital
+decomposition of the key. IE, at the root node you look up the 1st character and
+follow that branch repeat until you find the end of the branches. Nodes can be
+marked as "accepting" meaning they represent a complete word. Eg:
+
+ /he|she|his|hers/
+
+would convert into the following structure. Numbers represent states, letters
+following numbers represent valid transitions on the letter from that state, if
+the number is in square brackets it represents an accepting state, otherwise it
+will be in parenthesis.
+
+ +-h->+-e->[3]-+-r->(8)-+-s->[9]
+ | |
+ | (2)
+ | |
+ (1) +-i->(6)-+-s->[7]
+ |
+ +-s->(3)-+-h->(4)-+-e->[5]
+
+ Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
+
+This shows that when matching against the string 'hers' we will begin at state 1
+read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
+then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
+is also accepting. Thus we know that we can match both 'he' and 'hers' with a
+single traverse. We store a mapping from accepting to state to which word was
+matched, and then when we have multiple possibilities we try to complete the
+rest of the regex in the order in which they occured in the alternation.
+
+The only prior NFA like behaviour that would be changed by the TRIE support is
+the silent ignoring of duplicate alternations which are of the form:
+
+ / (DUPE|DUPE) X? (?{ ... }) Y /x
+
+Thus EVAL blocks follwing a trie may be called a different number of times with
+and without the optimisation. With the optimisations dupes will be silently
+ignored. This inconsistant behaviour of EVAL type nodes is well established as
+the following demonstrates:
+
+ 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
+
+which prints out 'word' three times, but
+
+ 'words'=~/(word|word|word)(?{ print $1 })S/
+
+which doesnt print it out at all. This is due to other optimisations kicking in.
+
+Example of what happens on a structural level:
+
+The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
+
+ 1: CURLYM[1] {1,32767}(18)
+ 5: BRANCH(8)
+ 6: EXACT <ac>(16)
+ 8: BRANCH(11)
+ 9: EXACT <ad>(16)
+ 11: BRANCH(14)
+ 12: EXACT <ab>(16)
+ 16: SUCCEED(0)
+ 17: NOTHING(18)
+ 18: END(0)
+
+This would be optimizable with startbranch=5, first=5, last=16, tail=16
+and should turn into:
+
+ 1: CURLYM[1] {1,32767}(18)
+ 5: TRIE(16)
+ [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
+ <ac>
+ <ad>
+ <ab>
+ 16: SUCCEED(0)
+ 17: NOTHING(18)
+ 18: END(0)
+
+Cases where tail != last would be like /(?foo|bar)baz/:
+
+ 1: BRANCH(4)
+ 2: EXACT <foo>(8)
+ 4: BRANCH(7)
+ 5: EXACT <bar>(8)
+ 7: TAIL(8)
+ 8: EXACT <baz>(10)
+ 10: END(0)
+
+which would be optimizable with startbranch=1, first=1, last=7, tail=8
+and would end up looking like:
+
+ 1: TRIE(8)
+ [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
+ <foo>
+ <bar>
+ 7: TAIL(8)
+ 8: EXACT <baz>(10)
+ 10: END(0)
+
+*/
+
+#define TRIE_DEBUG_CHAR \
+ DEBUG_TRIE_COMPILE_r({ \
+ SV *tmp; \
+ if ( UTF ) { \
+ tmp = newSVpv( "", 0 ); \
+ pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
+ } else { \
+ tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
+ } \
+ av_push( trie->revcharmap, tmp ); \
+ })
+
+#define TRIE_READ_CHAR STMT_START { \
+ if ( UTF ) { \
+ if ( folder ) { \
+ if ( foldlen > 0 ) { \
+ uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ scan += len; \
+ len = 0; \
+ } else { \
+ uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
+ uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
+ foldlen -= UNISKIP( uvc ); \
+ scan = foldbuf + UNISKIP( uvc ); \
+ } \
+ } else { \
+ uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
+ } \
+ } else { \
+ uvc = (U32)*uc; \
+ len = 1; \
+ } \
+} STMT_END
+
+
+#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
+#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
+#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
+#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
+
+#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
+ if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
+ TRIE_LIST_LEN( state ) *= 2; \
+ Renew( trie->states[ state ].trans.list, \
+ TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
+ } \
+ TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
+ TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
+ TRIE_LIST_CUR( state )++; \
+} STMT_END
+
+#define TRIE_LIST_NEW(state) STMT_START { \
+ Newz( 1023, trie->states[ state ].trans.list, \
+ 4, reg_trie_trans_le ); \
+ TRIE_LIST_CUR( state ) = 1; \
+ TRIE_LIST_LEN( state ) = 4; \
+} STMT_END
+
+STATIC I32
+S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
+{
+ dVAR;
+ /* first pass, loop through and scan words */
+ reg_trie_data *trie;
+ regnode *cur;
+ const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+ STRLEN len = 0;
+ UV uvc = 0;
+ U16 curword = 0;
+ U32 next_alloc = 0;
+ /* we just use folder as a flag in utf8 */
+ const U8 * const folder = ( flags == EXACTF
+ ? PL_fold
+ : ( flags == EXACTFL
+ ? PL_fold_locale
+ : NULL
+ )
+ );
+
+ const U32 data_slot = add_data( pRExC_state, 1, "t" );
+ SV *re_trie_maxbuff;
+
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ Newz( 848200, trie, 1, reg_trie_data );
+ trie->refcount = 1;
+ RExC_rx->data->data[ data_slot ] = (void*)trie;
+ Newz( 848201, trie->charmap, 256, U16 );
+ DEBUG_r({
+ trie->words = newAV();
+ trie->revcharmap = newAV();
+ });
+
+
+ re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
+ if (!SvIOK(re_trie_maxbuff)) {
+ sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
+ }
+
+ /* -- First loop and Setup --
+
+ We first traverse the branches and scan each word to determine if it
+ contains widechars, and how many unique chars there are, this is
+ important as we have to build a table with at least as many columns as we
+ have unique chars.
+
+ We use an array of integers to represent the character codes 0..255
+ (trie->charmap) and we use a an HV* to store unicode characters. We use the
+ native representation of the character value as the key and IV's for the
+ coded index.
+
+ *TODO* If we keep track of how many times each character is used we can
+ remap the columns so that the table compression later on is more
+ efficient in terms of memory by ensuring most common value is in the
+ middle and the least common are on the outside. IMO this would be better
+ than a most to least common mapping as theres a decent chance the most
+ common letter will share a node with the least common, meaning the node
+ will not be compressable. With a middle is most common approach the worst
+ case is when we have the least common nodes twice.
+
+ */
+
+
+ for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
+ regnode *noper = NEXTOPER( cur );
+ const U8 *uc = (U8*)STRING( noper );
+ const U8 *e = uc + STR_LEN( noper );
+ STRLEN foldlen = 0;
+ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ const U8 *scan = (U8*)NULL;
+
+ for ( ; uc < e ; uc += len ) {
+ trie->charcount++;
+ TRIE_READ_CHAR;
+ if ( uvc < 256 ) {
+ if ( !trie->charmap[ uvc ] ) {
+ trie->charmap[ uvc ]=( ++trie->uniquecharcount );
+ if ( folder )
+ trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
+ TRIE_DEBUG_CHAR;
+ }
+ } else {
+ SV** svpp;
+ if ( !trie->widecharmap )
+ trie->widecharmap = newHV();
+
+ svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
+
+ if ( !svpp )
+ Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
+
+ if ( !SvTRUE( *svpp ) ) {
+ sv_setiv( *svpp, ++trie->uniquecharcount );
+ TRIE_DEBUG_CHAR;
+ }
+ }
+ }
+ trie->wordcount++;
+ } /* end first pass */
+ DEBUG_TRIE_COMPILE_r(
+ PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
+ ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
+ (int)trie->charcount, trie->uniquecharcount )
+ );
+
+
+ /*
+ We now know what we are dealing with in terms of unique chars and
+ string sizes so we can calculate how much memory a naive
+ representation using a flat table will take. If it's over a reasonable
+ limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
+ conservative but potentially much slower representation using an array
+ of lists.
+
+ At the end we convert both representations into the same compressed
+ form that will be used in regexec.c for matching with. The latter
+ is a form that cannot be used to construct with but has memory
+ properties similar to the list form and access properties similar
+ to the table form making it both suitable for fast searches and
+ small enough that its feasable to store for the duration of a program.
+
+ See the comment in the code where the compressed table is produced
+ inplace from the flat tabe representation for an explanation of how
+ the compression works.
+
+ */
+
+
+ if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
+ /*
+ Second Pass -- Array Of Lists Representation
+
+ Each state will be represented by a list of charid:state records
+ (reg_trie_trans_le) the first such element holds the CUR and LEN
+ points of the allocated array. (See defines above).
+
+ We build the initial structure using the lists, and then convert
+ it into the compressed table form which allows faster lookups
+ (but cant be modified once converted).
+
+
+ */
+
+
+ STRLEN transcount = 1;
+
+ Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
+ TRIE_LIST_NEW(1);
+ next_alloc = 2;
+
+ for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
+
+ regnode *noper = NEXTOPER( cur );
+ U8 *uc = (U8*)STRING( noper );
+ U8 *e = uc + STR_LEN( noper );
+ U32 state = 1; /* required init */
+ U16 charid = 0; /* sanity init */
+ U8 *scan = (U8*)NULL; /* sanity init */
+ STRLEN foldlen = 0; /* required init */
+ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+
+
+ for ( ; uc < e ; uc += len ) {
+
+ TRIE_READ_CHAR;
+
+ if ( uvc < 256 ) {
+ charid = trie->charmap[ uvc ];
+ } else {
+ SV** svpp=(SV**)NULL;
+ svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
+ if ( !svpp ) {
+ charid = 0;
+ } else {
+ charid=(U16)SvIV( *svpp );
+ }
+ }
+ if ( charid ) {
+
+ U16 check;
+ U32 newstate = 0;
+
+ charid--;
+ if ( !trie->states[ state ].trans.list ) {
+ TRIE_LIST_NEW( state );
+ }
+ for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
+ if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
+ newstate = TRIE_LIST_ITEM( state, check ).newstate;
+ break;
+ }
+ }
+ if ( ! newstate ) {
+ newstate = next_alloc++;
+ TRIE_LIST_PUSH( state, charid, newstate );
+ transcount++;
+ }
+ state = newstate;
+
+ } else {
+ Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
+ }
+ /* charid is now 0 if we dont know the char read, or nonzero if we do */
+ }
+
+ if ( !trie->states[ state ].wordnum ) {
+ /* we havent inserted this word into the structure yet. */
+ trie->states[ state ].wordnum = ++curword;
+
+ DEBUG_r({
+ /* store the word for dumping */
+ SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
+ if ( UTF ) SvUTF8_on( tmp );
+ av_push( trie->words, tmp );
+ });
+
+ } else {
+ /* Its a dupe. So ignore it. */
+ }
+
+ } /* end second pass */
+
+ trie->laststate = next_alloc;
+ Renew( trie->states, next_alloc, reg_trie_state );
+
+ DEBUG_TRIE_COMPILE_MORE_r({
+ U32 state;
+ U16 charid;
+
+ /*
+ print out the table precompression.
+ */
+
+ PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
+ PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
+
+ for( state=1 ; state < next_alloc ; state ++ ) {
+
+ PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
+ if ( ! trie->states[ state ].wordnum ) {
+ PerlIO_printf( Perl_debug_log, "%5s| ","");
+ } else {
+ PerlIO_printf( Perl_debug_log, "W%04x| ",
+ trie->states[ state ].wordnum
+ );
+ }
+ for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
+ SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
+ PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
+ SvPV_nolen( *tmp ),
+ TRIE_LIST_ITEM(state,charid).forid,
+ (UV)TRIE_LIST_ITEM(state,charid).newstate
+ );
+ }
+
+ }
+ PerlIO_printf( Perl_debug_log, "\n\n" );
+ });
+
+ Newz( 848203, trie->trans, transcount ,reg_trie_trans );
+ {
+ U32 state;
+ U16 idx;
+ U32 tp = 0;
+ U32 zp = 0;
+
+
+ for( state=1 ; state < next_alloc ; state ++ ) {
+ U32 base=0;
+
+ /*
+ DEBUG_TRIE_COMPILE_MORE_r(
+ PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
+ );
+ */
+
+ if (trie->states[state].trans.list) {
+ U16 minid=TRIE_LIST_ITEM( state, 1).forid;
+ U16 maxid=minid;
+
+
+ for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
+ if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
+ minid=TRIE_LIST_ITEM( state, idx).forid;
+ } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
+ maxid=TRIE_LIST_ITEM( state, idx).forid;
+ }
+ }
+ if ( transcount < tp + maxid - minid + 1) {
+ transcount *= 2;
+ Renew( trie->trans, transcount, reg_trie_trans );
+ Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
+ }
+ base = trie->uniquecharcount + tp - minid;
+ if ( maxid == minid ) {
+ U32 set = 0;
+ for ( ; zp < tp ; zp++ ) {
+ if ( ! trie->trans[ zp ].next ) {
+ base = trie->uniquecharcount + zp - minid;
+ trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
+ trie->trans[ zp ].check = state;
+ set = 1;
+ break;
+ }
+ }
+ if ( !set ) {
+ trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
+ trie->trans[ tp ].check = state;
+ tp++;
+ zp = tp;
+ }
+ } else {
+ for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
+ U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
+ trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
+ trie->trans[ tid ].check = state;
+ }
+ tp += ( maxid - minid + 1 );
+ }
+ Safefree(trie->states[ state ].trans.list);
+ }
+ /*
+ DEBUG_TRIE_COMPILE_MORE_r(
+ PerlIO_printf( Perl_debug_log, " base: %d\n",base);
+ );
+ */
+ trie->states[ state ].trans.base=base;
+ }
+ trie->lasttrans = tp + 1;
+ }
+ } else {
+ /*
+ Second Pass -- Flat Table Representation.
+
+ we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
+ We know that we will need Charcount+1 trans at most to store the data
+ (one row per char at worst case) So we preallocate both structures
+ assuming worst case.
+
+ We then construct the trie using only the .next slots of the entry
+ structs.
+
+ We use the .check field of the first entry of the node temporarily to
+ make compression both faster and easier by keeping track of how many non
+ zero fields are in the node.
+
+ Since trans are numbered from 1 any 0 pointer in the table is a FAIL
+ transition.
+
+ There are two terms at use here: state as a TRIE_NODEIDX() which is a
+ number representing the first entry of the node, and state as a
+ TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
+ TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
+ are 2 entrys per node. eg:
+
+ A B A B
+ 1. 2 4 1. 3 7
+ 2. 0 3 3. 0 5
+ 3. 0 0 5. 0 0
+ 4. 0 0 7. 0 0
+
+ The table is internally in the right hand, idx form. However as we also
+ have to deal with the states array which is indexed by nodenum we have to
+ use TRIE_NODENUM() to convert.
+
+ */
+
+ Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
+ reg_trie_trans );
+ Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
+ next_alloc = trie->uniquecharcount + 1;
+
+ for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
+
+ regnode *noper = NEXTOPER( cur );
+ U8 *uc = (U8*)STRING( noper );
+ U8 *e = uc + STR_LEN( noper );
+
+ U32 state = 1; /* required init */
+
+ U16 charid = 0; /* sanity init */
+ U32 accept_state = 0; /* sanity init */
+ U8 *scan = (U8*)NULL; /* sanity init */
+
+ STRLEN foldlen = 0; /* required init */
+ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+
+
+ for ( ; uc < e ; uc += len ) {
+
+ TRIE_READ_CHAR;
+
+ if ( uvc < 256 ) {
+ charid = trie->charmap[ uvc ];
+ } else {
+ SV** svpp=(SV**)NULL;
+ svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
+ if ( !svpp ) {
+ charid = 0;
+ } else {
+ charid=(U16)SvIV( *svpp );
+ }
+ }
+ if ( charid ) {
+ charid--;
+ if ( !trie->trans[ state + charid ].next ) {
+ trie->trans[ state + charid ].next = next_alloc;
+ trie->trans[ state ].check++;
+ next_alloc += trie->uniquecharcount;
+ }
+ state = trie->trans[ state + charid ].next;
+ } else {
+ Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
+ }
+ /* charid is now 0 if we dont know the char read, or nonzero if we do */
+ }
+
+ accept_state = TRIE_NODENUM( state );
+ if ( !trie->states[ accept_state ].wordnum ) {
+ /* we havent inserted this word into the structure yet. */
+ trie->states[ accept_state ].wordnum = ++curword;
+
+ DEBUG_r({
+ /* store the word for dumping */
+ SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
+ if ( UTF ) SvUTF8_on( tmp );
+ av_push( trie->words, tmp );
+ });
+
+ } else {
+ /* Its a dupe. So ignore it. */
+ }
+
+ } /* end second pass */
+
+ DEBUG_TRIE_COMPILE_MORE_r({
+ /*
+ print out the table precompression so that we can do a visual check
+ that they are identical.
+ */
+ U32 state;
+ U16 charid;
+ PerlIO_printf( Perl_debug_log, "\nChar : " );
+
+ for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
+ SV **tmp = av_fetch( trie->revcharmap, charid, 0);
+ if ( tmp ) {
+ PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
+ }
+ }
+
+ PerlIO_printf( Perl_debug_log, "\nState+-" );
+
+ for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
+ PerlIO_printf( Perl_debug_log, "%4s-", "----" );
+ }
+
+ PerlIO_printf( Perl_debug_log, "\n" );
+
+ for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
+
+ PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
+
+ for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
+ PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
+ (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
+ }
+ if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
+ PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
+ } else {
+ PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
+ trie->states[ TRIE_NODENUM( state ) ].wordnum );
+ }
+ }
+ PerlIO_printf( Perl_debug_log, "\n\n" );
+ });
+ {
+ /*
+ * Inplace compress the table.*
+
+ For sparse data sets the table constructed by the trie algorithm will
+ be mostly 0/FAIL transitions or to put it another way mostly empty.
+ (Note that leaf nodes will not contain any transitions.)
+
+ This algorithm compresses the tables by eliminating most such
+ transitions, at the cost of a modest bit of extra work during lookup:
+
+ - Each states[] entry contains a .base field which indicates the
+ index in the state[] array wheres its transition data is stored.
+
+ - If .base is 0 there are no valid transitions from that node.
+
+ - If .base is nonzero then charid is added to it to find an entry in
+ the trans array.
+
+ -If trans[states[state].base+charid].check!=state then the
+ transition is taken to be a 0/Fail transition. Thus if there are fail
+ transitions at the front of the node then the .base offset will point
+ somewhere inside the previous nodes data (or maybe even into a node
+ even earlier), but the .check field determines if the transition is
+ valid.
+
+ The following process inplace converts the table to the compressed
+ table: We first do not compress the root node 1,and mark its all its
+ .check pointers as 1 and set its .base pointer as 1 as well. This
+ allows to do a DFA construction from the compressed table later, and
+ ensures that any .base pointers we calculate later are greater than
+ 0.
+
+ - We set 'pos' to indicate the first entry of the second node.
+
+ - We then iterate over the columns of the node, finding the first and
+ last used entry at l and m. We then copy l..m into pos..(pos+m-l),
+ and set the .check pointers accordingly, and advance pos
+ appropriately and repreat for the next node. Note that when we copy
+ the next pointers we have to convert them from the original
+ NODEIDX form to NODENUM form as the former is not valid post
+ compression.
+
+ - If a node has no transitions used we mark its base as 0 and do not
+ advance the pos pointer.
+
+ - If a node only has one transition we use a second pointer into the
+ structure to fill in allocated fail transitions from other states.
+ This pointer is independent of the main pointer and scans forward
+ looking for null transitions that are allocated to a state. When it
+ finds one it writes the single transition into the "hole". If the
+ pointer doesnt find one the single transition is appeneded as normal.
+
+ - Once compressed we can Renew/realloc the structures to release the
+ excess space.
+
+ See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
+ specifically Fig 3.47 and the associated pseudocode.
+
+ demq
+ */
+ const U32 laststate = TRIE_NODENUM( next_alloc );
+ U32 used , state, charid;
+ U32 pos = 0, zp=0;
+ trie->laststate = laststate;
+
+ for ( state = 1 ; state < laststate ; state++ ) {
+ U8 flag = 0;
+ U32 stateidx = TRIE_NODEIDX( state );
+ U32 o_used=trie->trans[ stateidx ].check;
+ used = trie->trans[ stateidx ].check;
+ trie->trans[ stateidx ].check = 0;
+
+ for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
+ if ( flag || trie->trans[ stateidx + charid ].next ) {
+ if ( trie->trans[ stateidx + charid ].next ) {
+ if (o_used == 1) {
+ for ( ; zp < pos ; zp++ ) {
+ if ( ! trie->trans[ zp ].next ) {
+ break;
+ }
+ }
+ trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
+ trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
+ trie->trans[ zp ].check = state;
+ if ( ++zp > pos ) pos = zp;
+ break;
+ }
+ used--;
+ }
+ if ( !flag ) {
+ flag = 1;
+ trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
+ }
+ trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
+ trie->trans[ pos ].check = state;
+ pos++;
+ }
+ }
+ }
+ trie->lasttrans = pos + 1;
+ Renew( trie->states, laststate + 1, reg_trie_state);
+ DEBUG_TRIE_COMPILE_MORE_r(
+ PerlIO_printf( Perl_debug_log,
+ " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
+ (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
+ (IV)next_alloc,
+ (IV)pos,
+ ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
+ );
+
+ } /* end table compress */
+ }
+ /* resize the trans array to remove unused space */
+ Renew( trie->trans, trie->lasttrans, reg_trie_trans);
+
+ DEBUG_TRIE_COMPILE_r({
+ U32 state;
+ /*
+ Now we print it out again, in a slightly different form as there is additional
+ info we want to be able to see when its compressed. They are close enough for
+ visual comparison though.
+ */
+ PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
+
+ for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
+ SV **tmp = av_fetch( trie->revcharmap, state, 0);
+ if ( tmp ) {
+ PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
+ }
+ }
+ PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
+
+ for( state = 0 ; state < trie->uniquecharcount ; state++ )
+ PerlIO_printf( Perl_debug_log, "-----");
+ PerlIO_printf( Perl_debug_log, "\n");
+
+ for( state = 1 ; state < trie->laststate ; state++ ) {
+ U32 base = trie->states[ state ].trans.base;
+
+ PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
+
+ if ( trie->states[ state ].wordnum ) {
+ PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
+ } else {
+ PerlIO_printf( Perl_debug_log, "%6s", "" );
+ }
+
+ PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
+
+ if ( base ) {
+ U32 ofs = 0;
+
+ while( ( base + ofs < trie->uniquecharcount ) ||
+ ( base + ofs - trie->uniquecharcount < trie->lasttrans
+ && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
+ ofs++;
+
+ PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
+
+ for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
+ if ( ( base + ofs >= trie->uniquecharcount ) &&
+ ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
+ trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
+ {
+ PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
+ (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
+ } else {
+ PerlIO_printf( Perl_debug_log, "%4s "," 0" );
+ }
+ }
+
+ PerlIO_printf( Perl_debug_log, "]");
+
+ }
+ PerlIO_printf( Perl_debug_log, "\n" );
+ }
+ });
+
+ {
+ /* now finally we "stitch in" the new TRIE node
+ This means we convert either the first branch or the first Exact,
+ depending on whether the thing following (in 'last') is a branch
+ or not and whther first is the startbranch (ie is it a sub part of
+ the alternation or is it the whole thing.)
+ Assuming its a sub part we conver the EXACT otherwise we convert
+ the whole branch sequence, including the first.
+ */
+ regnode *convert;
+
+
+
+
+ if ( first == startbranch && OP( last ) != BRANCH ) {
+ convert = first;
+ } else {
+ convert = NEXTOPER( first );
+ NEXT_OFF( first ) = (U16)(last - first);
+ }
+
+ OP( convert ) = TRIE + (U8)( flags - EXACT );
+ NEXT_OFF( convert ) = (U16)(tail - convert);
+ ARG_SET( convert, data_slot );
+
+ /* tells us if we need to handle accept buffers specially */
+ convert->flags = ( RExC_seen_evals ? 1 : 0 );
+
+
+ /* needed for dumping*/
+ DEBUG_r({
+ regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
+ /* We now need to mark all of the space originally used by the
+ branches as optimized away. This keeps the dumpuntil from
+ throwing a wobbly as it doesnt use regnext() to traverse the
+ opcodes.
+ */
+ while( optimize < last ) {
+ OP( optimize ) = OPTIMIZED;
+ optimize++;
+ }
+ });
+ } /* end node insert */
+ return 1;
+}
+
+
+
+/*