This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perldelta for fea90cfbe1f
[perl5.git] / regcomp_trie.c
CommitLineData
85900e28
YO
1#ifdef PERL_EXT_RE_BUILD
2#include "re_top.h"
3#endif
4
5#include "EXTERN.h"
6#define PERL_IN_REGEX_ENGINE
7#define PERL_IN_REGCOMP_ANY
8#define PERL_IN_REGCOMP_TRIE_C
9#include "perl.h"
10
11#ifdef PERL_IN_XSUB_RE
12# include "re_comp.h"
13#else
14# include "regcomp.h"
15#endif
16
17#include "invlist_inline.h"
18#include "unicode_constants.h"
19#include "regcomp_internal.h"
20
21#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
22#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
23#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
24#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
25 ? (TRIE_LIST_CUR( idx ) - 1) \
26 : 0 )
27
28
29#ifdef DEBUGGING
30/*
31 dump_trie(trie,widecharmap,revcharmap)
32 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
33 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
34
35 These routines dump out a trie in a somewhat readable format.
36 The _interim_ variants are used for debugging the interim
37 tables that are used to generate the final compressed
38 representation which is what dump_trie expects.
39
40 Part of the reason for their existence is to provide a form
41 of documentation as to how the different representations function.
42
43*/
44
45/*
46 Dumps the final compressed table form of the trie to Perl_debug_log.
47 Used for debugging make_trie().
48*/
49
50STATIC void
51S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
52 AV *revcharmap, U32 depth)
53{
54 U32 state;
55 SV *sv=sv_newmortal();
56 int colwidth= widecharmap ? 6 : 4;
57 U16 word;
58 DECLARE_AND_GET_RE_DEBUG_FLAGS;
59
60 PERL_ARGS_ASSERT_DUMP_TRIE;
61
62 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
63 depth+1, "Match","Base","Ofs" );
64
65 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
66 SV ** const tmp = av_fetch_simple( revcharmap, state, 0);
67 if ( tmp ) {
68 Perl_re_printf( aTHX_ "%*s",
69 colwidth,
70 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
71 PL_colors[0], PL_colors[1],
72 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
73 PERL_PV_ESCAPE_FIRSTCHAR
74 )
75 );
76 }
77 }
78 Perl_re_printf( aTHX_ "\n");
79 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
80
81 for( state = 0 ; state < trie->uniquecharcount ; state++ )
82 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
83 Perl_re_printf( aTHX_ "\n");
84
85 for( state = 1 ; state < trie->statecount ; state++ ) {
86 const U32 base = trie->states[ state ].trans.base;
87
88 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
89
90 if ( trie->states[ state ].wordnum ) {
91 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
92 } else {
93 Perl_re_printf( aTHX_ "%6s", "" );
94 }
95
96 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
97
98 if ( base ) {
99 U32 ofs = 0;
100
101 while( ( base + ofs < trie->uniquecharcount ) ||
102 ( base + ofs - trie->uniquecharcount < trie->lasttrans
103 && trie->trans[ base + ofs - trie->uniquecharcount ].check
104 != state))
105 ofs++;
106
107 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
108
109 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
110 if ( ( base + ofs >= trie->uniquecharcount )
111 && ( base + ofs - trie->uniquecharcount
112 < trie->lasttrans )
113 && trie->trans[ base + ofs
114 - trie->uniquecharcount ].check == state )
115 {
116 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
117 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
118 );
119 } else {
120 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
121 }
122 }
123
124 Perl_re_printf( aTHX_ "]");
125
126 }
127 Perl_re_printf( aTHX_ "\n" );
128 }
129 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
130 depth);
131 for (word=1; word <= trie->wordcount; word++) {
132 Perl_re_printf( aTHX_ " %d:(%d,%d)",
133 (int)word, (int)(trie->wordinfo[word].prev),
134 (int)(trie->wordinfo[word].len));
135 }
136 Perl_re_printf( aTHX_ "\n" );
137}
138/*
139 Dumps a fully constructed but uncompressed trie in list form.
140 List tries normally only are used for construction when the number of
141 possible chars (trie->uniquecharcount) is very high.
142 Used for debugging make_trie().
143*/
144STATIC void
145S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
146 HV *widecharmap, AV *revcharmap, U32 next_alloc,
147 U32 depth)
148{
149 U32 state;
150 SV *sv=sv_newmortal();
151 int colwidth= widecharmap ? 6 : 4;
152 DECLARE_AND_GET_RE_DEBUG_FLAGS;
153
154 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
155
156 /* print out the table precompression. */
157 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
158 depth+1 );
159 Perl_re_indentf( aTHX_ "%s",
160 depth+1, "------:-----+-----------------\n" );
161
162 for( state=1 ; state < next_alloc ; state ++ ) {
163 U16 charid;
164
165 Perl_re_indentf( aTHX_ " %4" UVXf " :",
166 depth+1, (UV)state );
167 if ( ! trie->states[ state ].wordnum ) {
168 Perl_re_printf( aTHX_ "%5s| ","");
169 } else {
170 Perl_re_printf( aTHX_ "W%4x| ",
171 trie->states[ state ].wordnum
172 );
173 }
174 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
175 SV ** const tmp = av_fetch_simple( revcharmap,
176 TRIE_LIST_ITEM(state, charid).forid, 0);
177 if ( tmp ) {
178 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
179 colwidth,
180 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
181 colwidth,
182 PL_colors[0], PL_colors[1],
183 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
184 | PERL_PV_ESCAPE_FIRSTCHAR
185 ) ,
186 TRIE_LIST_ITEM(state, charid).forid,
187 (UV)TRIE_LIST_ITEM(state, charid).newstate
188 );
189 if (!(charid % 10))
190 Perl_re_printf( aTHX_ "\n%*s| ",
191 (int)((depth * 2) + 14), "");
192 }
193 }
194 Perl_re_printf( aTHX_ "\n");
195 }
196}
197
198/*
199 Dumps a fully constructed but uncompressed trie in table form.
200 This is the normal DFA style state transition table, with a few
201 twists to facilitate compression later.
202 Used for debugging make_trie().
203*/
204STATIC void
205S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
206 HV *widecharmap, AV *revcharmap, U32 next_alloc,
207 U32 depth)
208{
209 U32 state;
210 U16 charid;
211 SV *sv=sv_newmortal();
212 int colwidth= widecharmap ? 6 : 4;
213 DECLARE_AND_GET_RE_DEBUG_FLAGS;
214
215 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
216
217 /*
218 print out the table precompression so that we can do a visual check
219 that they are identical.
220 */
221
222 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
223
224 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
225 SV ** const tmp = av_fetch_simple( revcharmap, charid, 0);
226 if ( tmp ) {
227 Perl_re_printf( aTHX_ "%*s",
228 colwidth,
229 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
230 PL_colors[0], PL_colors[1],
231 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
232 PERL_PV_ESCAPE_FIRSTCHAR
233 )
234 );
235 }
236 }
237
238 Perl_re_printf( aTHX_ "\n");
239 Perl_re_indentf( aTHX_ "State+-", depth+1 );
240
241 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
242 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
243 }
244
245 Perl_re_printf( aTHX_ "\n" );
246
247 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
248
249 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
250 depth+1,
251 (UV)TRIE_NODENUM( state ) );
252
253 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
254 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
255 if (v)
256 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
257 else
258 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
259 }
260 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
261 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
262 (UV)trie->trans[ state ].check );
263 } else {
264 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
265 (UV)trie->trans[ state ].check,
266 trie->states[ TRIE_NODENUM( state ) ].wordnum );
267 }
268 }
269}
270
271#endif
272
273
274/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
275 startbranch: the first branch in the whole branch sequence
276 first : start branch of sequence of branch-exact nodes.
277 May be the same as startbranch
278 last : Thing following the last branch.
279 May be the same as tail.
280 tail : item following the branch sequence
281 count : words in the sequence
282 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
283 depth : indent depth
284
285Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
286
287A trie is an N'ary tree where the branches are determined by digital
288decomposition of the key. IE, at the root node you look up the 1st character and
289follow that branch repeat until you find the end of the branches. Nodes can be
290marked as "accepting" meaning they represent a complete word. Eg:
291
292 /he|she|his|hers/
293
294would convert into the following structure. Numbers represent states, letters
295following numbers represent valid transitions on the letter from that state, if
296the number is in square brackets it represents an accepting state, otherwise it
297will be in parenthesis.
298
299 +-h->+-e->[3]-+-r->(8)-+-s->[9]
300 | |
301 | (2)
302 | |
303 (1) +-i->(6)-+-s->[7]
304 |
305 +-s->(3)-+-h->(4)-+-e->[5]
306
307 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
308
309This shows that when matching against the string 'hers' we will begin at state 1
310read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
311then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
312is also accepting. Thus we know that we can match both 'he' and 'hers' with a
313single traverse. We store a mapping from accepting to state to which word was
314matched, and then when we have multiple possibilities we try to complete the
315rest of the regex in the order in which they occurred in the alternation.
316
317The only prior NFA like behaviour that would be changed by the TRIE support is
318the silent ignoring of duplicate alternations which are of the form:
319
320 / (DUPE|DUPE) X? (?{ ... }) Y /x
321
322Thus EVAL blocks following a trie may be called a different number of times with
323and without the optimisation. With the optimisations dupes will be silently
324ignored. This inconsistent behaviour of EVAL type nodes is well established as
325the following demonstrates:
326
327 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
328
329which prints out 'word' three times, but
330
331 'words'=~/(word|word|word)(?{ print $1 })S/
332
333which doesnt print it out at all. This is due to other optimisations kicking in.
334
335Example of what happens on a structural level:
336
337The regexp /(ac|ad|ab)+/ will produce the following debug output:
338
339 1: CURLYM[1] {1,32767}(18)
340 5: BRANCH(8)
341 6: EXACT <ac>(16)
342 8: BRANCH(11)
343 9: EXACT <ad>(16)
344 11: BRANCH(14)
345 12: EXACT <ab>(16)
346 16: SUCCEED(0)
347 17: NOTHING(18)
348 18: END(0)
349
350This would be optimizable with startbranch=5, first=5, last=16, tail=16
351and should turn into:
352
353 1: CURLYM[1] {1,32767}(18)
354 5: TRIE(16)
355 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
356 <ac>
357 <ad>
358 <ab>
359 16: SUCCEED(0)
360 17: NOTHING(18)
361 18: END(0)
362
363Cases where tail != last would be like /(?foo|bar)baz/:
364
365 1: BRANCH(4)
366 2: EXACT <foo>(8)
367 4: BRANCH(7)
368 5: EXACT <bar>(8)
369 7: TAIL(8)
370 8: EXACT <baz>(10)
371 10: END(0)
372
373which would be optimizable with startbranch=1, first=1, last=7, tail=8
374and would end up looking like:
375
376 1: TRIE(8)
377 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
378 <foo>
379 <bar>
380 7: TAIL(8)
381 8: EXACT <baz>(10)
382 10: END(0)
383
384 d = uvchr_to_utf8_flags(d, uv, 0);
385
386is the recommended Unicode-aware way of saying
387
388 *(d++) = uv;
389*/
390
391#define TRIE_STORE_REVCHAR(val) \
392 STMT_START { \
393 if (UTF) { \
394 SV *zlopp = newSV(UTF8_MAXBYTES); \
395 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
396 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
397 *kapow = '\0'; \
398 SvCUR_set(zlopp, kapow - flrbbbbb); \
399 SvPOK_on(zlopp); \
400 SvUTF8_on(zlopp); \
401 av_push_simple(revcharmap, zlopp); \
402 } else { \
403 char ooooff = (char)val; \
404 av_push_simple(revcharmap, newSVpvn(&ooooff, 1)); \
405 } \
406 } STMT_END
407
408/* This gets the next character from the input, folding it if not already
409 * folded. */
410#define TRIE_READ_CHAR STMT_START { \
411 wordlen++; \
412 if ( UTF ) { \
413 /* if it is UTF then it is either already folded, or does not need \
414 * folding */ \
415 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
416 } \
417 else if (folder == PL_fold_latin1) { \
418 /* This folder implies Unicode rules, which in the range expressible \
419 * by not UTF is the lower case, with the two exceptions, one of \
420 * which should have been taken care of before calling this */ \
421 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
422 uvc = toLOWER_L1(*uc); \
423 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
424 len = 1; \
425 } else { \
426 /* raw data, will be folded later if needed */ \
427 uvc = (U32)*uc; \
428 len = 1; \
429 } \
430} STMT_END
431
432
433
434#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
435 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
436 U32 ging = TRIE_LIST_LEN( state ) * 2; \
437 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
438 TRIE_LIST_LEN( state ) = ging; \
439 } \
440 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
441 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
442 TRIE_LIST_CUR( state )++; \
443} STMT_END
444
445#define TRIE_LIST_NEW(state) STMT_START { \
446 Newx( trie->states[ state ].trans.list, \
447 4, reg_trie_trans_le ); \
448 TRIE_LIST_CUR( state ) = 1; \
449 TRIE_LIST_LEN( state ) = 4; \
450} STMT_END
451
452#define TRIE_HANDLE_WORD(state) STMT_START { \
453 U16 dupe= trie->states[ state ].wordnum; \
454 regnode * const noper_next = regnext( noper ); \
455 \
456 DEBUG_r({ \
457 /* store the word for dumping */ \
458 SV* tmp; \
459 if (OP(noper) != NOTHING) \
460 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
461 else \
462 tmp = newSVpvn_utf8( "", 0, UTF ); \
463 av_push_simple( trie_words, tmp ); \
464 }); \
465 \
466 curword++; \
467 trie->wordinfo[curword].prev = 0; \
468 trie->wordinfo[curword].len = wordlen; \
469 trie->wordinfo[curword].accept = state; \
470 \
471 if ( noper_next < tail ) { \
acababb4 472 if (!trie->jump) { \
85900e28
YO
473 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
474 sizeof(U16) ); \
acababb4
YO
475 trie->j_before_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
476 sizeof(U16) ); \
477 trie->j_after_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
478 sizeof(U16) ); \
479 } \
85900e28 480 trie->jump[curword] = (U16)(noper_next - convert); \
acababb4
YO
481 U16 set_before_paren; \
482 U16 set_after_paren; \
483 if (OP(cur) == BRANCH) { \
17e3e02a
YO
484 set_before_paren = ARG1a(cur); \
485 set_after_paren = ARG1b(cur); \
acababb4 486 } else { \
17e3e02a
YO
487 set_before_paren = ARG2a(cur); \
488 set_after_paren = ARG2b(cur); \
acababb4
YO
489 } \
490 trie->j_before_paren[curword] = set_before_paren; \
491 trie->j_after_paren[curword] = set_after_paren; \
85900e28
YO
492 if (!jumper) \
493 jumper = noper_next; \
494 if (!nextbranch) \
495 nextbranch= regnext(cur); \
496 } \
497 \
498 if ( dupe ) { \
499 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
500 /* chain, so that when the bits of chain are later */\
501 /* linked together, the dups appear in the chain */\
502 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
503 trie->wordinfo[dupe].prev = curword; \
504 } else { \
505 /* we haven't inserted this word yet. */ \
506 trie->states[ state ].wordnum = curword; \
507 } \
508} STMT_END
509
510
511#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
512 ( ( base + charid >= ucharcount \
513 && base + charid < ubound \
514 && state == trie->trans[ base - ucharcount + charid ].check \
515 && trie->trans[ base - ucharcount + charid ].next ) \
516 ? trie->trans[ base - ucharcount + charid ].next \
517 : ( state==1 ? special : 0 ) \
518 )
519
520#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
521STMT_START { \
522 TRIE_BITMAP_SET(trie, uvc); \
523 /* store the folded codepoint */ \
524 if ( folder ) \
525 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
526 \
527 if ( !UTF ) { \
528 /* store first byte of utf8 representation of */ \
529 /* variant codepoints */ \
530 if (! UVCHR_IS_INVARIANT(uvc)) { \
531 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
532 } \
533 } \
534} STMT_END
535
536I32
537Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
538 regnode *first, regnode *last, regnode *tail,
539 U32 word_count, U32 flags, U32 depth)
540{
541 /* first pass, loop through and scan words */
542 reg_trie_data *trie;
543 HV *widecharmap = NULL;
544 AV *revcharmap = newAV();
545 regnode *cur;
546 STRLEN len = 0;
547 UV uvc = 0;
548 U16 curword = 0;
549 U32 next_alloc = 0;
550 regnode *jumper = NULL;
551 regnode *nextbranch = NULL;
acababb4 552 regnode *lastbranch = NULL;
85900e28
YO
553 regnode *convert = NULL;
554 U32 *prev_states; /* temp array mapping each state to previous one */
555 /* we just use folder as a flag in utf8 */
556 const U8 * folder = NULL;
557
558 /* in the below reg_add_data call we are storing either 'tu' or 'tuaa'
559 * which stands for one trie structure, one hash, optionally followed
560 * by two arrays */
561#ifdef DEBUGGING
562 const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tuaa"));
563 AV *trie_words = NULL;
564 /* along with revcharmap, this only used during construction but both are
565 * useful during debugging so we store them in the struct when debugging.
566 */
567#else
568 const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tu"));
569 STRLEN trie_charcount=0;
570#endif
571 SV *re_trie_maxbuff;
572 DECLARE_AND_GET_RE_DEBUG_FLAGS;
573
574 PERL_ARGS_ASSERT_MAKE_TRIE;
575#ifndef DEBUGGING
576 PERL_UNUSED_ARG(depth);
577#endif
578
579 switch (flags) {
580 case EXACT: case EXACT_REQ8: case EXACTL: break;
581 case EXACTFAA:
582 case EXACTFUP:
583 case EXACTFU:
584 case EXACTFLU8: folder = PL_fold_latin1; break;
585 case EXACTF: folder = PL_fold; break;
586 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) );
587 }
588
acababb4 589 /* create the trie struct, all zeroed */
85900e28
YO
590 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
591 trie->refcount = 1;
592 trie->startstate = 1;
593 trie->wordcount = word_count;
594 RExC_rxi->data->data[ data_slot ] = (void*)trie;
595 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
596 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
597 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
598 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
599 trie->wordcount+1, sizeof(reg_trie_wordinfo));
600
601 DEBUG_r({
602 trie_words = newAV();
603 });
604
605 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
606 assert(re_trie_maxbuff);
607 if (!SvIOK(re_trie_maxbuff)) {
608 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
609 }
610 DEBUG_TRIE_COMPILE_r({
611 Perl_re_indentf( aTHX_
612 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
613 depth+1,
614 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
615 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
616 });
617
618 /* Find the node we are going to overwrite */
619 if ( first == startbranch && OP( last ) != BRANCH ) {
620 /* whole branch chain */
621 convert = first;
622 } else {
623 /* branch sub-chain */
624 convert = REGNODE_AFTER( first );
625 }
626
627 /* -- First loop and Setup --
628
629 We first traverse the branches and scan each word to determine if it
630 contains widechars, and how many unique chars there are, this is
631 important as we have to build a table with at least as many columns as we
632 have unique chars.
633
634 We use an array of integers to represent the character codes 0..255
635 (trie->charmap) and we use a an HV* to store Unicode characters. We use
636 the native representation of the character value as the key and IV's for
637 the coded index.
638
639 *TODO* If we keep track of how many times each character is used we can
640 remap the columns so that the table compression later on is more
641 efficient in terms of memory by ensuring the most common value is in the
642 middle and the least common are on the outside. IMO this would be better
643 than a most to least common mapping as theres a decent chance the most
644 common letter will share a node with the least common, meaning the node
645 will not be compressible. With a middle is most common approach the worst
646 case is when we have the least common nodes twice.
647
648 */
649
650 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
651 regnode *noper = REGNODE_AFTER( cur );
652 const U8 *uc;
653 const U8 *e;
654 int foldlen = 0;
655 U32 wordlen = 0; /* required init */
656 STRLEN minchars = 0;
657 STRLEN maxchars = 0;
658 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
659 bitmap?*/
acababb4 660 lastbranch = cur;
85900e28
YO
661
662 if (OP(noper) == NOTHING) {
663 /* skip past a NOTHING at the start of an alternation
664 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
665 *
666 * If the next node is not something we are supposed to process
667 * we will just ignore it due to the condition guarding the
668 * next block.
669 */
670
671 regnode *noper_next= regnext(noper);
672 if (noper_next < tail)
673 noper= noper_next;
674 }
675
676 if ( noper < tail
677 && ( OP(noper) == flags
678 || (flags == EXACT && OP(noper) == EXACT_REQ8)
679 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
680 || OP(noper) == EXACTFUP))))
681 {
682 uc= (U8*)STRING(noper);
683 e= uc + STR_LEN(noper);
684 } else {
685 trie->minlen= 0;
686 continue;
687 }
688
689
690 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
691 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
692 regardless of encoding */
693 if (OP( noper ) == EXACTFUP) {
694 /* false positives are ok, so just set this */
695 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
696 }
697 }
698
699 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
700 branch */
701 TRIE_CHARCOUNT(trie)++;
702 TRIE_READ_CHAR;
703
704 /* TRIE_READ_CHAR returns the current character, or its fold if /i
705 * is in effect. Under /i, this character can match itself, or
706 * anything that folds to it. If not under /i, it can match just
707 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
708 * all fold to k, and all are single characters. But some folds
709 * expand to more than one character, so for example LATIN SMALL
710 * LIGATURE FFI folds to the three character sequence 'ffi'. If
711 * the string beginning at 'uc' is 'ffi', it could be matched by
712 * three characters, or just by the one ligature character. (It
713 * could also be matched by two characters: LATIN SMALL LIGATURE FF
714 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
715 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
716 * match.) The trie needs to know the minimum and maximum number
717 * of characters that could match so that it can use size alone to
718 * quickly reject many match attempts. The max is simple: it is
719 * the number of folded characters in this branch (since a fold is
720 * never shorter than what folds to it. */
721
722 maxchars++;
723
724 /* And the min is equal to the max if not under /i (indicated by
725 * 'folder' being NULL), or there are no multi-character folds. If
726 * there is a multi-character fold, the min is incremented just
727 * once, for the character that folds to the sequence. Each
728 * character in the sequence needs to be added to the list below of
729 * characters in the trie, but we count only the first towards the
730 * min number of characters needed. This is done through the
731 * variable 'foldlen', which is returned by the macros that look
732 * for these sequences as the number of bytes the sequence
733 * occupies. Each time through the loop, we decrement 'foldlen' by
734 * how many bytes the current char occupies. Only when it reaches
735 * 0 do we increment 'minchars' or look for another multi-character
736 * sequence. */
737 if (folder == NULL) {
738 minchars++;
739 }
740 else if (foldlen > 0) {
741 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
742 }
743 else {
744 minchars++;
745
746 /* See if *uc is the beginning of a multi-character fold. If
747 * so, we decrement the length remaining to look at, to account
748 * for the current character this iteration. (We can use 'uc'
749 * instead of the fold returned by TRIE_READ_CHAR because the
750 * macro is smart enough to account for any unfolded
751 * characters. */
752 if (UTF) {
753 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
754 foldlen -= UTF8SKIP(uc);
755 }
756 }
757 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
758 foldlen--;
759 }
760 }
761
762 /* The current character (and any potential folds) should be added
763 * to the possible matching characters for this position in this
764 * branch */
765 if ( uvc < 256 ) {
766 if ( folder ) {
767 U8 folded= folder[ (U8) uvc ];
768 if ( !trie->charmap[ folded ] ) {
769 trie->charmap[ folded ]=( ++trie->uniquecharcount );
770 TRIE_STORE_REVCHAR( folded );
771 }
772 }
773 if ( !trie->charmap[ uvc ] ) {
774 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
775 TRIE_STORE_REVCHAR( uvc );
776 }
777 if ( set_bit ) {
778 /* store the codepoint in the bitmap, and its folded
779 * equivalent. */
780 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
781 set_bit = 0; /* We've done our bit :-) */
782 }
783 } else {
784
785 /* XXX We could come up with the list of code points that fold
786 * to this using PL_utf8_foldclosures, except not for
787 * multi-char folds, as there may be multiple combinations
788 * there that could work, which needs to wait until runtime to
789 * resolve (The comment about LIGATURE FFI above is such an
790 * example */
791
792 SV** svpp;
793 if ( !widecharmap )
794 widecharmap = newHV();
795
796 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
797
798 if ( !svpp )
799 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
800
801 if ( !SvTRUE( *svpp ) ) {
802 sv_setiv( *svpp, ++trie->uniquecharcount );
803 TRIE_STORE_REVCHAR(uvc);
804 }
805 }
806 } /* end loop through characters in this branch of the trie */
807
808 /* We take the min and max for this branch and combine to find the min
809 * and max for all branches processed so far */
810 if( cur == first ) {
811 trie->minlen = minchars;
812 trie->maxlen = maxchars;
813 } else if (minchars < trie->minlen) {
814 trie->minlen = minchars;
815 } else if (maxchars > trie->maxlen) {
816 trie->maxlen = maxchars;
817 }
818 } /* end first pass */
acababb4 819 trie->before_paren = OP(first) == BRANCH
17e3e02a
YO
820 ? ARG1a(first)
821 : ARG2a(first); /* BRANCHJ */
acababb4
YO
822
823 trie->after_paren = OP(lastbranch) == BRANCH
17e3e02a
YO
824 ? ARG1b(lastbranch)
825 : ARG2b(lastbranch); /* BRANCHJ */
85900e28
YO
826 DEBUG_TRIE_COMPILE_r(
827 Perl_re_indentf( aTHX_
828 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
829 depth+1,
830 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
831 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
832 (int)trie->minlen, (int)trie->maxlen )
833 );
834
835 /*
836 We now know what we are dealing with in terms of unique chars and
837 string sizes so we can calculate how much memory a naive
838 representation using a flat table will take. If it's over a reasonable
839 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
840 conservative but potentially much slower representation using an array
841 of lists.
842
843 At the end we convert both representations into the same compressed
844 form that will be used in regexec.c for matching with. The latter
845 is a form that cannot be used to construct with but has memory
846 properties similar to the list form and access properties similar
847 to the table form making it both suitable for fast searches and
848 small enough that its feasable to store for the duration of a program.
849
850 See the comment in the code where the compressed table is produced
851 inplace from the flat tabe representation for an explanation of how
852 the compression works.
853
854 */
855
856
857 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
858 prev_states[1] = 0;
859
860 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
861 > SvIV(re_trie_maxbuff) )
862 {
863 /*
864 Second Pass -- Array Of Lists Representation
865
866 Each state will be represented by a list of charid:state records
867 (reg_trie_trans_le) the first such element holds the CUR and LEN
868 points of the allocated array. (See defines above).
869
870 We build the initial structure using the lists, and then convert
871 it into the compressed table form which allows faster lookups
872 (but cant be modified once converted).
873 */
874
875 STRLEN transcount = 1;
876
877 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
878 depth+1));
879
880 trie->states = (reg_trie_state *)
881 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
882 sizeof(reg_trie_state) );
883 TRIE_LIST_NEW(1);
884 next_alloc = 2;
885
886 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
887
888 regnode *noper = REGNODE_AFTER( cur );
889 U32 state = 1; /* required init */
890 U16 charid = 0; /* sanity init */
891 U32 wordlen = 0; /* required init */
892
893 if (OP(noper) == NOTHING) {
894 regnode *noper_next= regnext(noper);
895 if (noper_next < tail)
896 noper= noper_next;
897 /* we will undo this assignment if noper does not
898 * point at a trieable type in the else clause of
899 * the following statement. */
900 }
901
902 if ( noper < tail
903 && ( OP(noper) == flags
904 || (flags == EXACT && OP(noper) == EXACT_REQ8)
905 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
906 || OP(noper) == EXACTFUP))))
907 {
908 const U8 *uc= (U8*)STRING(noper);
909 const U8 *e= uc + STR_LEN(noper);
910
911 for ( ; uc < e ; uc += len ) {
912
913 TRIE_READ_CHAR;
914
915 if ( uvc < 256 ) {
916 charid = trie->charmap[ uvc ];
917 } else {
918 SV** const svpp = hv_fetch( widecharmap,
919 (char*)&uvc,
920 sizeof( UV ),
921 0);
922 if ( !svpp ) {
923 charid = 0;
924 } else {
925 charid=(U16)SvIV( *svpp );
926 }
927 }
928 /* charid is now 0 if we dont know the char read, or
929 * nonzero if we do */
930 if ( charid ) {
931
932 U16 check;
933 U32 newstate = 0;
934
935 charid--;
936 if ( !trie->states[ state ].trans.list ) {
937 TRIE_LIST_NEW( state );
938 }
939 for ( check = 1;
940 check <= TRIE_LIST_USED( state );
941 check++ )
942 {
943 if ( TRIE_LIST_ITEM( state, check ).forid
944 == charid )
945 {
946 newstate = TRIE_LIST_ITEM( state, check ).newstate;
947 break;
948 }
949 }
950 if ( ! newstate ) {
951 newstate = next_alloc++;
952 prev_states[newstate] = state;
953 TRIE_LIST_PUSH( state, charid, newstate );
954 transcount++;
955 }
956 state = newstate;
957 } else {
958 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
959 }
960 }
961 } else {
962 /* If we end up here it is because we skipped past a NOTHING, but did not end up
963 * on a trieable type. So we need to reset noper back to point at the first regop
964 * in the branch before we call TRIE_HANDLE_WORD()
965 */
966 noper= REGNODE_AFTER(cur);
967 }
968 TRIE_HANDLE_WORD(state);
969
970 } /* end second pass */
971
972 /* next alloc is the NEXT state to be allocated */
973 trie->statecount = next_alloc;
974 trie->states = (reg_trie_state *)
975 PerlMemShared_realloc( trie->states,
976 next_alloc
977 * sizeof(reg_trie_state) );
978
979 /* and now dump it out before we compress it */
980 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
981 revcharmap, next_alloc,
982 depth+1)
983 );
984
985 trie->trans = (reg_trie_trans *)
986 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
987 {
988 U32 state;
989 U32 tp = 0;
990 U32 zp = 0;
991
992
993 for( state=1 ; state < next_alloc ; state ++ ) {
994 U32 base=0;
995
996 /*
997 DEBUG_TRIE_COMPILE_MORE_r(
998 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
999 );
1000 */
1001
1002 if (trie->states[state].trans.list) {
1003 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1004 U16 maxid=minid;
1005 U16 idx;
1006
1007 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1008 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1009 if ( forid < minid ) {
1010 minid=forid;
1011 } else if ( forid > maxid ) {
1012 maxid=forid;
1013 }
1014 }
1015 if ( transcount < tp + maxid - minid + 1) {
1016 transcount *= 2;
1017 trie->trans = (reg_trie_trans *)
1018 PerlMemShared_realloc( trie->trans,
1019 transcount
1020 * sizeof(reg_trie_trans) );
1021 Zero( trie->trans + (transcount / 2),
1022 transcount / 2,
1023 reg_trie_trans );
1024 }
1025 base = trie->uniquecharcount + tp - minid;
1026 if ( maxid == minid ) {
1027 U32 set = 0;
1028 for ( ; zp < tp ; zp++ ) {
1029 if ( ! trie->trans[ zp ].next ) {
1030 base = trie->uniquecharcount + zp - minid;
1031 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
1032 1).newstate;
1033 trie->trans[ zp ].check = state;
1034 set = 1;
1035 break;
1036 }
1037 }
1038 if ( !set ) {
1039 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
1040 1).newstate;
1041 trie->trans[ tp ].check = state;
1042 tp++;
1043 zp = tp;
1044 }
1045 } else {
1046 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1047 const U32 tid = base
1048 - trie->uniquecharcount
1049 + TRIE_LIST_ITEM( state, idx ).forid;
1050 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
1051 idx ).newstate;
1052 trie->trans[ tid ].check = state;
1053 }
1054 tp += ( maxid - minid + 1 );
1055 }
1056 Safefree(trie->states[ state ].trans.list);
1057 }
1058 /*
1059 DEBUG_TRIE_COMPILE_MORE_r(
1060 Perl_re_printf( aTHX_ " base: %d\n",base);
1061 );
1062 */
1063 trie->states[ state ].trans.base=base;
1064 }
1065 trie->lasttrans = tp + 1;
1066 }
1067 } else {
1068 /*
1069 Second Pass -- Flat Table Representation.
1070
1071 we dont use the 0 slot of either trans[] or states[] so we add 1 to
1072 each. We know that we will need Charcount+1 trans at most to store
1073 the data (one row per char at worst case) So we preallocate both
1074 structures assuming worst case.
1075
1076 We then construct the trie using only the .next slots of the entry
1077 structs.
1078
1079 We use the .check field of the first entry of the node temporarily
1080 to make compression both faster and easier by keeping track of how
1081 many non zero fields are in the node.
1082
1083 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1084 transition.
1085
1086 There are two terms at use here: state as a TRIE_NODEIDX() which is
1087 a number representing the first entry of the node, and state as a
1088 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
1089 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
1090 if there are 2 entrys per node. eg:
1091
1092 A B A B
1093 1. 2 4 1. 3 7
1094 2. 0 3 3. 0 5
1095 3. 0 0 5. 0 0
1096 4. 0 0 7. 0 0
1097
1098 The table is internally in the right hand, idx form. However as we
1099 also have to deal with the states array which is indexed by nodenum
1100 we have to use TRIE_NODENUM() to convert.
1101
1102 */
1103 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
1104 depth+1));
1105
1106 trie->trans = (reg_trie_trans *)
1107 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1108 * trie->uniquecharcount + 1,
1109 sizeof(reg_trie_trans) );
1110 trie->states = (reg_trie_state *)
1111 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1112 sizeof(reg_trie_state) );
1113 next_alloc = trie->uniquecharcount + 1;
1114
1115
1116 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1117
1118 regnode *noper = REGNODE_AFTER( cur );
1119
1120 U32 state = 1; /* required init */
1121
1122 U16 charid = 0; /* sanity init */
1123 U32 accept_state = 0; /* sanity init */
1124
1125 U32 wordlen = 0; /* required init */
1126
1127 if (OP(noper) == NOTHING) {
1128 regnode *noper_next= regnext(noper);
1129 if (noper_next < tail)
1130 noper= noper_next;
1131 /* we will undo this assignment if noper does not
1132 * point at a trieable type in the else clause of
1133 * the following statement. */
1134 }
1135
1136 if ( noper < tail
1137 && ( OP(noper) == flags
1138 || (flags == EXACT && OP(noper) == EXACT_REQ8)
1139 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
1140 || OP(noper) == EXACTFUP))))
1141 {
1142 const U8 *uc= (U8*)STRING(noper);
1143 const U8 *e= uc + STR_LEN(noper);
1144
1145 for ( ; uc < e ; uc += len ) {
1146
1147 TRIE_READ_CHAR;
1148
1149 if ( uvc < 256 ) {
1150 charid = trie->charmap[ uvc ];
1151 } else {
1152 SV* const * const svpp = hv_fetch( widecharmap,
1153 (char*)&uvc,
1154 sizeof( UV ),
1155 0);
1156 charid = svpp ? (U16)SvIV(*svpp) : 0;
1157 }
1158 if ( charid ) {
1159 charid--;
1160 if ( !trie->trans[ state + charid ].next ) {
1161 trie->trans[ state + charid ].next = next_alloc;
1162 trie->trans[ state ].check++;
1163 prev_states[TRIE_NODENUM(next_alloc)]
1164 = TRIE_NODENUM(state);
1165 next_alloc += trie->uniquecharcount;
1166 }
1167 state = trie->trans[ state + charid ].next;
1168 } else {
1169 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
1170 }
1171 /* charid is now 0 if we dont know the char read, or
1172 * nonzero if we do */
1173 }
1174 } else {
1175 /* If we end up here it is because we skipped past a NOTHING, but did not end up
1176 * on a trieable type. So we need to reset noper back to point at the first regop
1177 * in the branch before we call TRIE_HANDLE_WORD().
1178 */
1179 noper= REGNODE_AFTER(cur);
1180 }
1181 accept_state = TRIE_NODENUM( state );
1182 TRIE_HANDLE_WORD(accept_state);
1183
1184 } /* end second pass */
1185
1186 /* and now dump it out before we compress it */
1187 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1188 revcharmap,
1189 next_alloc, depth+1));
1190
1191 {
1192 /*
1193 * Inplace compress the table.*
1194
1195 For sparse data sets the table constructed by the trie algorithm will
1196 be mostly 0/FAIL transitions or to put it another way mostly empty.
1197 (Note that leaf nodes will not contain any transitions.)
1198
1199 This algorithm compresses the tables by eliminating most such
1200 transitions, at the cost of a modest bit of extra work during lookup:
1201
1202 - Each states[] entry contains a .base field which indicates the
1203 index in the state[] array wheres its transition data is stored.
1204
1205 - If .base is 0 there are no valid transitions from that node.
1206
1207 - If .base is nonzero then charid is added to it to find an entry in
1208 the trans array.
1209
1210 -If trans[states[state].base+charid].check!=state then the
1211 transition is taken to be a 0/Fail transition. Thus if there are fail
1212 transitions at the front of the node then the .base offset will point
1213 somewhere inside the previous nodes data (or maybe even into a node
1214 even earlier), but the .check field determines if the transition is
1215 valid.
1216
1217 XXX - wrong maybe?
1218 The following process inplace converts the table to the compressed
1219 table: We first do not compress the root node 1,and mark all its
1220 .check pointers as 1 and set its .base pointer as 1 as well. This
1221 allows us to do a DFA construction from the compressed table later,
1222 and ensures that any .base pointers we calculate later are greater
1223 than 0.
1224
1225 - We set 'pos' to indicate the first entry of the second node.
1226
1227 - We then iterate over the columns of the node, finding the first and
1228 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1229 and set the .check pointers accordingly, and advance pos
1230 appropriately and repreat for the next node. Note that when we copy
1231 the next pointers we have to convert them from the original
1232 NODEIDX form to NODENUM form as the former is not valid post
1233 compression.
1234
1235 - If a node has no transitions used we mark its base as 0 and do not
1236 advance the pos pointer.
1237
1238 - If a node only has one transition we use a second pointer into the
1239 structure to fill in allocated fail transitions from other states.
1240 This pointer is independent of the main pointer and scans forward
1241 looking for null transitions that are allocated to a state. When it
1242 finds one it writes the single transition into the "hole". If the
1243 pointer doesnt find one the single transition is appended as normal.
1244
1245 - Once compressed we can Renew/realloc the structures to release the
1246 excess space.
1247
1248 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1249 specifically Fig 3.47 and the associated pseudocode.
1250
1251 demq
1252 */
1253 const U32 laststate = TRIE_NODENUM( next_alloc );
1254 U32 state, charid;
1255 U32 pos = 0, zp=0;
1256 trie->statecount = laststate;
1257
1258 for ( state = 1 ; state < laststate ; state++ ) {
1259 U8 flag = 0;
1260 const U32 stateidx = TRIE_NODEIDX( state );
1261 const U32 o_used = trie->trans[ stateidx ].check;
1262 U32 used = trie->trans[ stateidx ].check;
1263 trie->trans[ stateidx ].check = 0;
1264
1265 for ( charid = 0;
1266 used && charid < trie->uniquecharcount;
1267 charid++ )
1268 {
1269 if ( flag || trie->trans[ stateidx + charid ].next ) {
1270 if ( trie->trans[ stateidx + charid ].next ) {
1271 if (o_used == 1) {
1272 for ( ; zp < pos ; zp++ ) {
1273 if ( ! trie->trans[ zp ].next ) {
1274 break;
1275 }
1276 }
1277 trie->states[ state ].trans.base
1278 = zp
1279 + trie->uniquecharcount
1280 - charid ;
1281 trie->trans[ zp ].next
1282 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
1283 + charid ].next );
1284 trie->trans[ zp ].check = state;
1285 if ( ++zp > pos ) pos = zp;
1286 break;
1287 }
1288 used--;
1289 }
1290 if ( !flag ) {
1291 flag = 1;
1292 trie->states[ state ].trans.base
1293 = pos + trie->uniquecharcount - charid ;
1294 }
1295 trie->trans[ pos ].next
1296 = SAFE_TRIE_NODENUM(
1297 trie->trans[ stateidx + charid ].next );
1298 trie->trans[ pos ].check = state;
1299 pos++;
1300 }
1301 }
1302 }
1303 trie->lasttrans = pos + 1;
1304 trie->states = (reg_trie_state *)
1305 PerlMemShared_realloc( trie->states, laststate
1306 * sizeof(reg_trie_state) );
1307 DEBUG_TRIE_COMPILE_MORE_r(
1308 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
1309 depth+1,
1310 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
1311 + 1 ),
1312 (IV)next_alloc,
1313 (IV)pos,
1314 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1315 );
1316
1317 } /* end table compress */
1318 }
1319 DEBUG_TRIE_COMPILE_MORE_r(
1320 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
1321 depth+1,
1322 (UV)trie->statecount,
1323 (UV)trie->lasttrans)
1324 );
1325 /* resize the trans array to remove unused space */
1326 trie->trans = (reg_trie_trans *)
1327 PerlMemShared_realloc( trie->trans, trie->lasttrans
1328 * sizeof(reg_trie_trans) );
1329
1330 { /* Modify the program and insert the new TRIE node */
1331 U8 nodetype =(U8) flags;
1332 char *str=NULL;
1333
1334#ifdef DEBUGGING
1335 regnode *optimize = NULL;
1336#endif /* DEBUGGING */
3645ca4e
YO
1337 /* make sure we have enough room to inject the TRIE op */
1338 assert((!trie->jump) || !trie->jump[1] ||
1339 (trie->jump[1] >= (sizeof(tregnode_TRIE)/sizeof(struct regnode))));
85900e28
YO
1340 /*
1341 This means we convert either the first branch or the first Exact,
1342 depending on whether the thing following (in 'last') is a branch
1343 or not and whther first is the startbranch (ie is it a sub part of
1344 the alternation or is it the whole thing.)
1345 Assuming its a sub part we convert the EXACT otherwise we convert
1346 the whole branch sequence, including the first.
1347 */
1348 /* Find the node we are going to overwrite */
1349 if ( first != startbranch || OP( last ) == BRANCH ) {
1350 /* branch sub-chain */
1351 NEXT_OFF( first ) = (U16)(last - first);
1352 /* whole branch chain */
1353 }
1354 /* But first we check to see if there is a common prefix we can
1355 split out as an EXACT and put in front of the TRIE node. */
1356 trie->startstate= 1;
1357 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1358 /* we want to find the first state that has more than
1359 * one transition, if that state is not the first state
1360 * then we have a common prefix which we can remove.
1361 */
1362 U32 state;
1363 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1364 U32 ofs = 0;
1365 I32 first_ofs = -1; /* keeps track of the ofs of the first
1366 transition, -1 means none */
1367 U32 count = 0;
1368 const U32 base = trie->states[ state ].trans.base;
1369
1370 /* does this state terminate an alternation? */
1371 if ( trie->states[state].wordnum )
1372 count = 1;
1373
1374 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1375 if ( ( base + ofs >= trie->uniquecharcount ) &&
1376 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1377 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1378 {
1379 if ( ++count > 1 ) {
1380 /* we have more than one transition */
1381 SV **tmp;
1382 U8 *ch;
1383 /* if this is the first state there is no common prefix
1384 * to extract, so we can exit */
1385 if ( state == 1 ) break;
1386 tmp = av_fetch_simple( revcharmap, ofs, 0);
1387 ch = (U8*)SvPV_nolen_const( *tmp );
1388
1389 /* if we are on count 2 then we need to initialize the
1390 * bitmap, and store the previous char if there was one
1391 * in it*/
1392 if ( count == 2 ) {
1393 /* clear the bitmap */
1394 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1395 DEBUG_OPTIMISE_r(
1396 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
1397 depth+1,
1398 (UV)state));
1399 if (first_ofs >= 0) {
1400 SV ** const tmp = av_fetch_simple( revcharmap, first_ofs, 0);
1401 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1402
1403 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
1404 DEBUG_OPTIMISE_r(
1405 Perl_re_printf( aTHX_ "%s", (char*)ch)
1406 );
1407 }
1408 }
1409 /* store the current firstchar in the bitmap */
1410 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
1411 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
1412 }
1413 first_ofs = ofs;
1414 }
1415 }
1416 if ( count == 1 ) {
1417 /* This state has only one transition, its transition is part
1418 * of a common prefix - we need to concatenate the char it
1419 * represents to what we have so far. */
1420 SV **tmp = av_fetch_simple( revcharmap, first_ofs, 0);
1421 STRLEN len;
1422 char *ch = SvPV( *tmp, len );
1423 DEBUG_OPTIMISE_r({
1424 SV *sv=sv_newmortal();
1425 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
1426 depth+1,
1427 (UV)state, (UV)first_ofs,
1428 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1429 PL_colors[0], PL_colors[1],
1430 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1431 PERL_PV_ESCAPE_FIRSTCHAR
1432 )
1433 );
1434 });
1435 if ( state==1 ) {
1436 OP( convert ) = nodetype;
1437 str=STRING(convert);
1438 setSTR_LEN(convert, 0);
1439 }
1440 assert( ( STR_LEN(convert) + len ) < 256 );
1441 setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
1442 while (len--)
1443 *str++ = *ch++;
1444 } else {
1445#ifdef DEBUGGING
1446 if (state>1)
1447 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
1448#endif
1449 break;
1450 }
1451 }
1452 trie->prefixlen = (state-1);
1453 if (str) {
1454 regnode *n = REGNODE_AFTER(convert);
1455 assert( n - convert <= U16_MAX );
1456 NEXT_OFF(convert) = n - convert;
1457 trie->startstate = state;
1458 trie->minlen -= (state - 1);
1459 trie->maxlen -= (state - 1);
1460#ifdef DEBUGGING
1461 /* At least the UNICOS C compiler choked on this
1462 * being argument to DEBUG_r(), so let's just have
1463 * it right here. */
1464 if (
1465#ifdef PERL_EXT_RE_BUILD
1466 1
1467#else
1468 DEBUG_r_TEST
1469#endif
1470 ) {
1471 U32 word = trie->wordcount;
1472 while (word--) {
1473 SV ** const tmp = av_fetch_simple( trie_words, word, 0 );
1474 if (tmp) {
1475 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1476 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1477 else
1478 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1479 }
1480 }
1481 }
1482#endif
1483 if (trie->maxlen) {
1484 convert = n;
1485 } else {
1486 NEXT_OFF(convert) = (U16)(tail - convert);
1487 DEBUG_r(optimize= n);
1488 }
1489 }
1490 }
1491 if (!jumper)
1492 jumper = last;
1493 if ( trie->maxlen ) {
1494 NEXT_OFF( convert ) = (U16)(tail - convert);
17e3e02a 1495 ARG1u_SET( convert, data_slot );
85900e28
YO
1496 /* Store the offset to the first unabsorbed branch in
1497 jump[0], which is otherwise unused by the jump logic.
1498 We use this when dumping a trie and during optimisation. */
1499 if (trie->jump)
1500 trie->jump[0] = (U16)(nextbranch - convert);
1501
1502 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
1503 * and there is a bitmap
1504 * and the first "jump target" node we found leaves enough room
1505 * then convert the TRIE node into a TRIEC node, with the bitmap
1506 * embedded inline in the opcode - this is hypothetically faster.
1507 */
1508 if ( !trie->states[trie->startstate].wordnum
1509 && trie->bitmap
3645ca4e 1510 && ( (char *)jumper - (char *)convert) >= (int)sizeof(tregnode_TRIEC) )
85900e28
YO
1511 {
1512 OP( convert ) = TRIEC;
3645ca4e 1513 Copy(trie->bitmap, ((tregnode_TRIEC *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
85900e28
YO
1514 PerlMemShared_free(trie->bitmap);
1515 trie->bitmap= NULL;
1516 } else
1517 OP( convert ) = TRIE;
1518
1519 /* store the type in the flags */
44eb4cdc 1520 FLAGS(convert) = nodetype;
85900e28
YO
1521 DEBUG_r({
1522 optimize = convert
1523 + NODE_STEP_REGNODE
1524 + REGNODE_ARG_LEN( OP( convert ) );
1525 });
1526 /* XXX We really should free up the resource in trie now,
1527 as we won't use them - (which resources?) dmq */
1528 }
1529 /* needed for dumping*/
1530 DEBUG_r(if (optimize) {
1531 /*
1532 Try to clean up some of the debris left after the
1533 optimisation.
1534 */
1535 while( optimize < jumper ) {
1536 OP( optimize ) = OPTIMIZED;
1537 optimize++;
1538 }
1539 });
1540 } /* end node insert */
1541
1542 /* Finish populating the prev field of the wordinfo array. Walk back
1543 * from each accept state until we find another accept state, and if
1544 * so, point the first word's .prev field at the second word. If the
1545 * second already has a .prev field set, stop now. This will be the
1546 * case either if we've already processed that word's accept state,
1547 * or that state had multiple words, and the overspill words were
1548 * already linked up earlier.
1549 */
1550 {
1551 U16 word;
1552 U32 state;
1553 U16 prev;
1554
1555 for (word=1; word <= trie->wordcount; word++) {
1556 prev = 0;
1557 if (trie->wordinfo[word].prev)
1558 continue;
1559 state = trie->wordinfo[word].accept;
1560 while (state) {
1561 state = prev_states[state];
1562 if (!state)
1563 break;
1564 prev = trie->states[state].wordnum;
1565 if (prev)
1566 break;
1567 }
1568 trie->wordinfo[word].prev = prev;
1569 }
1570 Safefree(prev_states);
1571 }
1572
1573
1574 /* and now dump out the compressed format */
1575 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1576
1577 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
1578#ifdef DEBUGGING
1579 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
1580 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
1581#else
1582 SvREFCNT_dec_NN(revcharmap);
1583#endif
1584 return trie->jump
1585 ? MADE_JUMP_TRIE
1586 : trie->startstate>1
1587 ? MADE_EXACT_TRIE
1588 : MADE_TRIE;
1589}
1590
1591regnode *
1592Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
1593{
1594/* The Trie is constructed and compressed now so we can build a fail array if
1595 * it's needed
1596
1597 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
1598 3.32 in the
1599 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
1600 Ullman 1985/88
1601 ISBN 0-201-10088-6
1602
1603 We find the fail state for each state in the trie, this state is the longest
1604 proper suffix of the current state's 'word' that is also a proper prefix of
1605 another word in our trie. State 1 represents the word '' and is thus the
1606 default fail state. This allows the DFA not to have to restart after its
1607 tried and failed a word at a given point, it simply continues as though it
1608 had been matching the other word in the first place.
1609 Consider
1610 'abcdgu'=~/abcdefg|cdgu/
1611 When we get to 'd' we are still matching the first word, we would encounter
1612 'g' which would fail, which would bring us to the state representing 'd' in
1613 the second word where we would try 'g' and succeed, proceeding to match
1614 'cdgu'.
1615 */
1616 /* add a fail transition */
17e3e02a 1617 const U32 trie_offset = ARG1u(source);
85900e28
YO
1618 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
1619 U32 *q;
1620 const U32 ucharcount = trie->uniquecharcount;
1621 const U32 numstates = trie->statecount;
1622 const U32 ubound = trie->lasttrans + ucharcount;
1623 U32 q_read = 0;
1624 U32 q_write = 0;
1625 U32 charid;
1626 U32 base = trie->states[ 1 ].trans.base;
1627 U32 *fail;
1628 reg_ac_data *aho;
1629 const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("T"));
1630 regnode *stclass;
1631 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1632
1633 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
1634 PERL_UNUSED_CONTEXT;
1635#ifndef DEBUGGING
1636 PERL_UNUSED_ARG(depth);
1637#endif
1638
1639 if ( OP(source) == TRIE ) {
3645ca4e
YO
1640 tregnode_TRIE *op = (tregnode_TRIE *)
1641 PerlMemShared_calloc(1, sizeof(tregnode_TRIE));
1642 StructCopy(source, op, tregnode_TRIE);
85900e28
YO
1643 stclass = (regnode *)op;
1644 } else {
3645ca4e
YO
1645 tregnode_TRIEC *op = (tregnode_TRIEC *)
1646 PerlMemShared_calloc(1, sizeof(tregnode_TRIEC));
1647 StructCopy(source, op, tregnode_TRIEC);
85900e28
YO
1648 stclass = (regnode *)op;
1649 }
1650 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
1651
17e3e02a 1652 ARG1u_SET( stclass, data_slot );
85900e28
YO
1653 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
1654 RExC_rxi->data->data[ data_slot ] = (void*)aho;
1655 aho->trie=trie_offset;
1656 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
1657 Copy( trie->states, aho->states, numstates, reg_trie_state );
1658 Newx( q, numstates, U32);
1659 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
1660 aho->refcount = 1;
1661 fail = aho->fail;
1662 /* initialize fail[0..1] to be 1 so that we always have
1663 a valid final fail state */
1664 fail[ 0 ] = fail[ 1 ] = 1;
1665
1666 for ( charid = 0; charid < ucharcount ; charid++ ) {
1667 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1668 if ( newstate ) {
1669 q[ q_write ] = newstate;
1670 /* set to point at the root */
1671 fail[ q[ q_write++ ] ]=1;
1672 }
1673 }
1674 while ( q_read < q_write) {
1675 const U32 cur = q[ q_read++ % numstates ];
1676 base = trie->states[ cur ].trans.base;
1677
1678 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1679 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1680 if (ch_state) {
1681 U32 fail_state = cur;
1682 U32 fail_base;
1683 do {
1684 fail_state = fail[ fail_state ];
1685 fail_base = aho->states[ fail_state ].trans.base;
1686 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1687
1688 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1689 fail[ ch_state ] = fail_state;
1690 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1691 {
1692 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1693 }
1694 q[ q_write++ % numstates] = ch_state;
1695 }
1696 }
1697 }
1698 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
1699 when we fail in state 1, this allows us to use the
1700 charclass scan to find a valid start char. This is based on the principle
1701 that theres a good chance the string being searched contains lots of stuff
1702 that cant be a start char.
1703 */
1704 fail[ 0 ] = fail[ 1 ] = 0;
1705 DEBUG_TRIE_COMPILE_r({
1706 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
1707 depth, (UV)numstates
1708 );
1709 for( q_read=1; q_read<numstates; q_read++ ) {
1710 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
1711 }
1712 Perl_re_printf( aTHX_ "\n");
1713 });
1714 Safefree(q);
1715 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
1716 return stclass;
1717}