Commit | Line | Data |
---|---|---|
85900e28 YO |
1 | #ifdef PERL_EXT_RE_BUILD |
2 | #include "re_top.h" | |
3 | #endif | |
4 | ||
5 | #include "EXTERN.h" | |
6 | #define PERL_IN_REGEX_ENGINE | |
7 | #define PERL_IN_REGCOMP_ANY | |
8 | #define PERL_IN_REGCOMP_TRIE_C | |
9 | #include "perl.h" | |
10 | ||
11 | #ifdef PERL_IN_XSUB_RE | |
12 | # include "re_comp.h" | |
13 | #else | |
14 | # include "regcomp.h" | |
15 | #endif | |
16 | ||
17 | #include "invlist_inline.h" | |
18 | #include "unicode_constants.h" | |
19 | #include "regcomp_internal.h" | |
20 | ||
21 | #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] | |
22 | #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) | |
23 | #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) | |
24 | #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ | |
25 | ? (TRIE_LIST_CUR( idx ) - 1) \ | |
26 | : 0 ) | |
27 | ||
28 | ||
29 | #ifdef DEBUGGING | |
30 | /* | |
31 | dump_trie(trie,widecharmap,revcharmap) | |
32 | dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) | |
33 | dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) | |
34 | ||
35 | These routines dump out a trie in a somewhat readable format. | |
36 | The _interim_ variants are used for debugging the interim | |
37 | tables that are used to generate the final compressed | |
38 | representation which is what dump_trie expects. | |
39 | ||
40 | Part of the reason for their existence is to provide a form | |
41 | of documentation as to how the different representations function. | |
42 | ||
43 | */ | |
44 | ||
45 | /* | |
46 | Dumps the final compressed table form of the trie to Perl_debug_log. | |
47 | Used for debugging make_trie(). | |
48 | */ | |
49 | ||
50 | STATIC void | |
51 | S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, | |
52 | AV *revcharmap, U32 depth) | |
53 | { | |
54 | U32 state; | |
55 | SV *sv=sv_newmortal(); | |
56 | int colwidth= widecharmap ? 6 : 4; | |
57 | U16 word; | |
58 | DECLARE_AND_GET_RE_DEBUG_FLAGS; | |
59 | ||
60 | PERL_ARGS_ASSERT_DUMP_TRIE; | |
61 | ||
62 | Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ", | |
63 | depth+1, "Match","Base","Ofs" ); | |
64 | ||
65 | for( state = 0 ; state < trie->uniquecharcount ; state++ ) { | |
66 | SV ** const tmp = av_fetch_simple( revcharmap, state, 0); | |
67 | if ( tmp ) { | |
68 | Perl_re_printf( aTHX_ "%*s", | |
69 | colwidth, | |
70 | pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, | |
71 | PL_colors[0], PL_colors[1], | |
72 | (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | | |
73 | PERL_PV_ESCAPE_FIRSTCHAR | |
74 | ) | |
75 | ); | |
76 | } | |
77 | } | |
78 | Perl_re_printf( aTHX_ "\n"); | |
79 | Perl_re_indentf( aTHX_ "State|-----------------------", depth+1); | |
80 | ||
81 | for( state = 0 ; state < trie->uniquecharcount ; state++ ) | |
82 | Perl_re_printf( aTHX_ "%.*s", colwidth, "--------"); | |
83 | Perl_re_printf( aTHX_ "\n"); | |
84 | ||
85 | for( state = 1 ; state < trie->statecount ; state++ ) { | |
86 | const U32 base = trie->states[ state ].trans.base; | |
87 | ||
88 | Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state); | |
89 | ||
90 | if ( trie->states[ state ].wordnum ) { | |
91 | Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum ); | |
92 | } else { | |
93 | Perl_re_printf( aTHX_ "%6s", "" ); | |
94 | } | |
95 | ||
96 | Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base ); | |
97 | ||
98 | if ( base ) { | |
99 | U32 ofs = 0; | |
100 | ||
101 | while( ( base + ofs < trie->uniquecharcount ) || | |
102 | ( base + ofs - trie->uniquecharcount < trie->lasttrans | |
103 | && trie->trans[ base + ofs - trie->uniquecharcount ].check | |
104 | != state)) | |
105 | ofs++; | |
106 | ||
107 | Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs); | |
108 | ||
109 | for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { | |
110 | if ( ( base + ofs >= trie->uniquecharcount ) | |
111 | && ( base + ofs - trie->uniquecharcount | |
112 | < trie->lasttrans ) | |
113 | && trie->trans[ base + ofs | |
114 | - trie->uniquecharcount ].check == state ) | |
115 | { | |
116 | Perl_re_printf( aTHX_ "%*" UVXf, colwidth, | |
117 | (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next | |
118 | ); | |
119 | } else { | |
120 | Perl_re_printf( aTHX_ "%*s", colwidth," ." ); | |
121 | } | |
122 | } | |
123 | ||
124 | Perl_re_printf( aTHX_ "]"); | |
125 | ||
126 | } | |
127 | Perl_re_printf( aTHX_ "\n" ); | |
128 | } | |
129 | Perl_re_indentf( aTHX_ "word_info N:(prev,len)=", | |
130 | depth); | |
131 | for (word=1; word <= trie->wordcount; word++) { | |
132 | Perl_re_printf( aTHX_ " %d:(%d,%d)", | |
133 | (int)word, (int)(trie->wordinfo[word].prev), | |
134 | (int)(trie->wordinfo[word].len)); | |
135 | } | |
136 | Perl_re_printf( aTHX_ "\n" ); | |
137 | } | |
138 | /* | |
139 | Dumps a fully constructed but uncompressed trie in list form. | |
140 | List tries normally only are used for construction when the number of | |
141 | possible chars (trie->uniquecharcount) is very high. | |
142 | Used for debugging make_trie(). | |
143 | */ | |
144 | STATIC void | |
145 | S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, | |
146 | HV *widecharmap, AV *revcharmap, U32 next_alloc, | |
147 | U32 depth) | |
148 | { | |
149 | U32 state; | |
150 | SV *sv=sv_newmortal(); | |
151 | int colwidth= widecharmap ? 6 : 4; | |
152 | DECLARE_AND_GET_RE_DEBUG_FLAGS; | |
153 | ||
154 | PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; | |
155 | ||
156 | /* print out the table precompression. */ | |
157 | Perl_re_indentf( aTHX_ "State :Word | Transition Data\n", | |
158 | depth+1 ); | |
159 | Perl_re_indentf( aTHX_ "%s", | |
160 | depth+1, "------:-----+-----------------\n" ); | |
161 | ||
162 | for( state=1 ; state < next_alloc ; state ++ ) { | |
163 | U16 charid; | |
164 | ||
165 | Perl_re_indentf( aTHX_ " %4" UVXf " :", | |
166 | depth+1, (UV)state ); | |
167 | if ( ! trie->states[ state ].wordnum ) { | |
168 | Perl_re_printf( aTHX_ "%5s| ",""); | |
169 | } else { | |
170 | Perl_re_printf( aTHX_ "W%4x| ", | |
171 | trie->states[ state ].wordnum | |
172 | ); | |
173 | } | |
174 | for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { | |
175 | SV ** const tmp = av_fetch_simple( revcharmap, | |
176 | TRIE_LIST_ITEM(state, charid).forid, 0); | |
177 | if ( tmp ) { | |
178 | Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ", | |
179 | colwidth, | |
180 | pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), | |
181 | colwidth, | |
182 | PL_colors[0], PL_colors[1], | |
183 | (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | |
184 | | PERL_PV_ESCAPE_FIRSTCHAR | |
185 | ) , | |
186 | TRIE_LIST_ITEM(state, charid).forid, | |
187 | (UV)TRIE_LIST_ITEM(state, charid).newstate | |
188 | ); | |
189 | if (!(charid % 10)) | |
190 | Perl_re_printf( aTHX_ "\n%*s| ", | |
191 | (int)((depth * 2) + 14), ""); | |
192 | } | |
193 | } | |
194 | Perl_re_printf( aTHX_ "\n"); | |
195 | } | |
196 | } | |
197 | ||
198 | /* | |
199 | Dumps a fully constructed but uncompressed trie in table form. | |
200 | This is the normal DFA style state transition table, with a few | |
201 | twists to facilitate compression later. | |
202 | Used for debugging make_trie(). | |
203 | */ | |
204 | STATIC void | |
205 | S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, | |
206 | HV *widecharmap, AV *revcharmap, U32 next_alloc, | |
207 | U32 depth) | |
208 | { | |
209 | U32 state; | |
210 | U16 charid; | |
211 | SV *sv=sv_newmortal(); | |
212 | int colwidth= widecharmap ? 6 : 4; | |
213 | DECLARE_AND_GET_RE_DEBUG_FLAGS; | |
214 | ||
215 | PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; | |
216 | ||
217 | /* | |
218 | print out the table precompression so that we can do a visual check | |
219 | that they are identical. | |
220 | */ | |
221 | ||
222 | Perl_re_indentf( aTHX_ "Char : ", depth+1 ); | |
223 | ||
224 | for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { | |
225 | SV ** const tmp = av_fetch_simple( revcharmap, charid, 0); | |
226 | if ( tmp ) { | |
227 | Perl_re_printf( aTHX_ "%*s", | |
228 | colwidth, | |
229 | pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, | |
230 | PL_colors[0], PL_colors[1], | |
231 | (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | | |
232 | PERL_PV_ESCAPE_FIRSTCHAR | |
233 | ) | |
234 | ); | |
235 | } | |
236 | } | |
237 | ||
238 | Perl_re_printf( aTHX_ "\n"); | |
239 | Perl_re_indentf( aTHX_ "State+-", depth+1 ); | |
240 | ||
241 | for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { | |
242 | Perl_re_printf( aTHX_ "%.*s", colwidth,"--------"); | |
243 | } | |
244 | ||
245 | Perl_re_printf( aTHX_ "\n" ); | |
246 | ||
247 | for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { | |
248 | ||
249 | Perl_re_indentf( aTHX_ "%4" UVXf " : ", | |
250 | depth+1, | |
251 | (UV)TRIE_NODENUM( state ) ); | |
252 | ||
253 | for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { | |
254 | UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); | |
255 | if (v) | |
256 | Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v ); | |
257 | else | |
258 | Perl_re_printf( aTHX_ "%*s", colwidth, "." ); | |
259 | } | |
260 | if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { | |
261 | Perl_re_printf( aTHX_ " (%4" UVXf ")\n", | |
262 | (UV)trie->trans[ state ].check ); | |
263 | } else { | |
264 | Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n", | |
265 | (UV)trie->trans[ state ].check, | |
266 | trie->states[ TRIE_NODENUM( state ) ].wordnum ); | |
267 | } | |
268 | } | |
269 | } | |
270 | ||
271 | #endif | |
272 | ||
273 | ||
274 | /* make_trie(startbranch,first,last,tail,word_count,flags,depth) | |
275 | startbranch: the first branch in the whole branch sequence | |
276 | first : start branch of sequence of branch-exact nodes. | |
277 | May be the same as startbranch | |
278 | last : Thing following the last branch. | |
279 | May be the same as tail. | |
280 | tail : item following the branch sequence | |
281 | count : words in the sequence | |
282 | flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/ | |
283 | depth : indent depth | |
284 | ||
285 | Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. | |
286 | ||
287 | A trie is an N'ary tree where the branches are determined by digital | |
288 | decomposition of the key. IE, at the root node you look up the 1st character and | |
289 | follow that branch repeat until you find the end of the branches. Nodes can be | |
290 | marked as "accepting" meaning they represent a complete word. Eg: | |
291 | ||
292 | /he|she|his|hers/ | |
293 | ||
294 | would convert into the following structure. Numbers represent states, letters | |
295 | following numbers represent valid transitions on the letter from that state, if | |
296 | the number is in square brackets it represents an accepting state, otherwise it | |
297 | will be in parenthesis. | |
298 | ||
299 | +-h->+-e->[3]-+-r->(8)-+-s->[9] | |
300 | | | | |
301 | | (2) | |
302 | | | | |
303 | (1) +-i->(6)-+-s->[7] | |
304 | | | |
305 | +-s->(3)-+-h->(4)-+-e->[5] | |
306 | ||
307 | Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) | |
308 | ||
309 | This shows that when matching against the string 'hers' we will begin at state 1 | |
310 | read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, | |
311 | then read 'r' and go to state 8 followed by 's' which takes us to state 9 which | |
312 | is also accepting. Thus we know that we can match both 'he' and 'hers' with a | |
313 | single traverse. We store a mapping from accepting to state to which word was | |
314 | matched, and then when we have multiple possibilities we try to complete the | |
315 | rest of the regex in the order in which they occurred in the alternation. | |
316 | ||
317 | The only prior NFA like behaviour that would be changed by the TRIE support is | |
318 | the silent ignoring of duplicate alternations which are of the form: | |
319 | ||
320 | / (DUPE|DUPE) X? (?{ ... }) Y /x | |
321 | ||
322 | Thus EVAL blocks following a trie may be called a different number of times with | |
323 | and without the optimisation. With the optimisations dupes will be silently | |
324 | ignored. This inconsistent behaviour of EVAL type nodes is well established as | |
325 | the following demonstrates: | |
326 | ||
327 | 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ | |
328 | ||
329 | which prints out 'word' three times, but | |
330 | ||
331 | 'words'=~/(word|word|word)(?{ print $1 })S/ | |
332 | ||
333 | which doesnt print it out at all. This is due to other optimisations kicking in. | |
334 | ||
335 | Example of what happens on a structural level: | |
336 | ||
337 | The regexp /(ac|ad|ab)+/ will produce the following debug output: | |
338 | ||
339 | 1: CURLYM[1] {1,32767}(18) | |
340 | 5: BRANCH(8) | |
341 | 6: EXACT <ac>(16) | |
342 | 8: BRANCH(11) | |
343 | 9: EXACT <ad>(16) | |
344 | 11: BRANCH(14) | |
345 | 12: EXACT <ab>(16) | |
346 | 16: SUCCEED(0) | |
347 | 17: NOTHING(18) | |
348 | 18: END(0) | |
349 | ||
350 | This would be optimizable with startbranch=5, first=5, last=16, tail=16 | |
351 | and should turn into: | |
352 | ||
353 | 1: CURLYM[1] {1,32767}(18) | |
354 | 5: TRIE(16) | |
355 | [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] | |
356 | <ac> | |
357 | <ad> | |
358 | <ab> | |
359 | 16: SUCCEED(0) | |
360 | 17: NOTHING(18) | |
361 | 18: END(0) | |
362 | ||
363 | Cases where tail != last would be like /(?foo|bar)baz/: | |
364 | ||
365 | 1: BRANCH(4) | |
366 | 2: EXACT <foo>(8) | |
367 | 4: BRANCH(7) | |
368 | 5: EXACT <bar>(8) | |
369 | 7: TAIL(8) | |
370 | 8: EXACT <baz>(10) | |
371 | 10: END(0) | |
372 | ||
373 | which would be optimizable with startbranch=1, first=1, last=7, tail=8 | |
374 | and would end up looking like: | |
375 | ||
376 | 1: TRIE(8) | |
377 | [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] | |
378 | <foo> | |
379 | <bar> | |
380 | 7: TAIL(8) | |
381 | 8: EXACT <baz>(10) | |
382 | 10: END(0) | |
383 | ||
384 | d = uvchr_to_utf8_flags(d, uv, 0); | |
385 | ||
386 | is the recommended Unicode-aware way of saying | |
387 | ||
388 | *(d++) = uv; | |
389 | */ | |
390 | ||
391 | #define TRIE_STORE_REVCHAR(val) \ | |
392 | STMT_START { \ | |
393 | if (UTF) { \ | |
394 | SV *zlopp = newSV(UTF8_MAXBYTES); \ | |
395 | unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ | |
396 | unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ | |
397 | *kapow = '\0'; \ | |
398 | SvCUR_set(zlopp, kapow - flrbbbbb); \ | |
399 | SvPOK_on(zlopp); \ | |
400 | SvUTF8_on(zlopp); \ | |
401 | av_push_simple(revcharmap, zlopp); \ | |
402 | } else { \ | |
403 | char ooooff = (char)val; \ | |
404 | av_push_simple(revcharmap, newSVpvn(&ooooff, 1)); \ | |
405 | } \ | |
406 | } STMT_END | |
407 | ||
408 | /* This gets the next character from the input, folding it if not already | |
409 | * folded. */ | |
410 | #define TRIE_READ_CHAR STMT_START { \ | |
411 | wordlen++; \ | |
412 | if ( UTF ) { \ | |
413 | /* if it is UTF then it is either already folded, or does not need \ | |
414 | * folding */ \ | |
415 | uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ | |
416 | } \ | |
417 | else if (folder == PL_fold_latin1) { \ | |
418 | /* This folder implies Unicode rules, which in the range expressible \ | |
419 | * by not UTF is the lower case, with the two exceptions, one of \ | |
420 | * which should have been taken care of before calling this */ \ | |
421 | assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ | |
422 | uvc = toLOWER_L1(*uc); \ | |
423 | if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ | |
424 | len = 1; \ | |
425 | } else { \ | |
426 | /* raw data, will be folded later if needed */ \ | |
427 | uvc = (U32)*uc; \ | |
428 | len = 1; \ | |
429 | } \ | |
430 | } STMT_END | |
431 | ||
432 | ||
433 | ||
434 | #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ | |
435 | if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ | |
436 | U32 ging = TRIE_LIST_LEN( state ) * 2; \ | |
437 | Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ | |
438 | TRIE_LIST_LEN( state ) = ging; \ | |
439 | } \ | |
440 | TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ | |
441 | TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ | |
442 | TRIE_LIST_CUR( state )++; \ | |
443 | } STMT_END | |
444 | ||
445 | #define TRIE_LIST_NEW(state) STMT_START { \ | |
446 | Newx( trie->states[ state ].trans.list, \ | |
447 | 4, reg_trie_trans_le ); \ | |
448 | TRIE_LIST_CUR( state ) = 1; \ | |
449 | TRIE_LIST_LEN( state ) = 4; \ | |
450 | } STMT_END | |
451 | ||
452 | #define TRIE_HANDLE_WORD(state) STMT_START { \ | |
453 | U16 dupe= trie->states[ state ].wordnum; \ | |
454 | regnode * const noper_next = regnext( noper ); \ | |
455 | \ | |
456 | DEBUG_r({ \ | |
457 | /* store the word for dumping */ \ | |
458 | SV* tmp; \ | |
459 | if (OP(noper) != NOTHING) \ | |
460 | tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ | |
461 | else \ | |
462 | tmp = newSVpvn_utf8( "", 0, UTF ); \ | |
463 | av_push_simple( trie_words, tmp ); \ | |
464 | }); \ | |
465 | \ | |
466 | curword++; \ | |
467 | trie->wordinfo[curword].prev = 0; \ | |
468 | trie->wordinfo[curword].len = wordlen; \ | |
469 | trie->wordinfo[curword].accept = state; \ | |
470 | \ | |
471 | if ( noper_next < tail ) { \ | |
acababb4 | 472 | if (!trie->jump) { \ |
85900e28 YO |
473 | trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ |
474 | sizeof(U16) ); \ | |
acababb4 YO |
475 | trie->j_before_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \ |
476 | sizeof(U16) ); \ | |
477 | trie->j_after_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \ | |
478 | sizeof(U16) ); \ | |
479 | } \ | |
85900e28 | 480 | trie->jump[curword] = (U16)(noper_next - convert); \ |
acababb4 YO |
481 | U16 set_before_paren; \ |
482 | U16 set_after_paren; \ | |
483 | if (OP(cur) == BRANCH) { \ | |
17e3e02a YO |
484 | set_before_paren = ARG1a(cur); \ |
485 | set_after_paren = ARG1b(cur); \ | |
acababb4 | 486 | } else { \ |
17e3e02a YO |
487 | set_before_paren = ARG2a(cur); \ |
488 | set_after_paren = ARG2b(cur); \ | |
acababb4 YO |
489 | } \ |
490 | trie->j_before_paren[curword] = set_before_paren; \ | |
491 | trie->j_after_paren[curword] = set_after_paren; \ | |
85900e28 YO |
492 | if (!jumper) \ |
493 | jumper = noper_next; \ | |
494 | if (!nextbranch) \ | |
495 | nextbranch= regnext(cur); \ | |
496 | } \ | |
497 | \ | |
498 | if ( dupe ) { \ | |
499 | /* It's a dupe. Pre-insert into the wordinfo[].prev */\ | |
500 | /* chain, so that when the bits of chain are later */\ | |
501 | /* linked together, the dups appear in the chain */\ | |
502 | trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ | |
503 | trie->wordinfo[dupe].prev = curword; \ | |
504 | } else { \ | |
505 | /* we haven't inserted this word yet. */ \ | |
506 | trie->states[ state ].wordnum = curword; \ | |
507 | } \ | |
508 | } STMT_END | |
509 | ||
510 | ||
511 | #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ | |
512 | ( ( base + charid >= ucharcount \ | |
513 | && base + charid < ubound \ | |
514 | && state == trie->trans[ base - ucharcount + charid ].check \ | |
515 | && trie->trans[ base - ucharcount + charid ].next ) \ | |
516 | ? trie->trans[ base - ucharcount + charid ].next \ | |
517 | : ( state==1 ? special : 0 ) \ | |
518 | ) | |
519 | ||
520 | #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \ | |
521 | STMT_START { \ | |
522 | TRIE_BITMAP_SET(trie, uvc); \ | |
523 | /* store the folded codepoint */ \ | |
524 | if ( folder ) \ | |
525 | TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \ | |
526 | \ | |
527 | if ( !UTF ) { \ | |
528 | /* store first byte of utf8 representation of */ \ | |
529 | /* variant codepoints */ \ | |
530 | if (! UVCHR_IS_INVARIANT(uvc)) { \ | |
531 | TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \ | |
532 | } \ | |
533 | } \ | |
534 | } STMT_END | |
535 | ||
536 | I32 | |
537 | Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, | |
538 | regnode *first, regnode *last, regnode *tail, | |
539 | U32 word_count, U32 flags, U32 depth) | |
540 | { | |
541 | /* first pass, loop through and scan words */ | |
542 | reg_trie_data *trie; | |
543 | HV *widecharmap = NULL; | |
544 | AV *revcharmap = newAV(); | |
545 | regnode *cur; | |
546 | STRLEN len = 0; | |
547 | UV uvc = 0; | |
548 | U16 curword = 0; | |
549 | U32 next_alloc = 0; | |
550 | regnode *jumper = NULL; | |
551 | regnode *nextbranch = NULL; | |
acababb4 | 552 | regnode *lastbranch = NULL; |
85900e28 YO |
553 | regnode *convert = NULL; |
554 | U32 *prev_states; /* temp array mapping each state to previous one */ | |
555 | /* we just use folder as a flag in utf8 */ | |
556 | const U8 * folder = NULL; | |
557 | ||
558 | /* in the below reg_add_data call we are storing either 'tu' or 'tuaa' | |
559 | * which stands for one trie structure, one hash, optionally followed | |
560 | * by two arrays */ | |
561 | #ifdef DEBUGGING | |
562 | const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tuaa")); | |
563 | AV *trie_words = NULL; | |
564 | /* along with revcharmap, this only used during construction but both are | |
565 | * useful during debugging so we store them in the struct when debugging. | |
566 | */ | |
567 | #else | |
568 | const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tu")); | |
569 | STRLEN trie_charcount=0; | |
570 | #endif | |
571 | SV *re_trie_maxbuff; | |
572 | DECLARE_AND_GET_RE_DEBUG_FLAGS; | |
573 | ||
574 | PERL_ARGS_ASSERT_MAKE_TRIE; | |
575 | #ifndef DEBUGGING | |
576 | PERL_UNUSED_ARG(depth); | |
577 | #endif | |
578 | ||
579 | switch (flags) { | |
580 | case EXACT: case EXACT_REQ8: case EXACTL: break; | |
581 | case EXACTFAA: | |
582 | case EXACTFUP: | |
583 | case EXACTFU: | |
584 | case EXACTFLU8: folder = PL_fold_latin1; break; | |
585 | case EXACTF: folder = PL_fold; break; | |
586 | default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) ); | |
587 | } | |
588 | ||
acababb4 | 589 | /* create the trie struct, all zeroed */ |
85900e28 YO |
590 | trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); |
591 | trie->refcount = 1; | |
592 | trie->startstate = 1; | |
593 | trie->wordcount = word_count; | |
594 | RExC_rxi->data->data[ data_slot ] = (void*)trie; | |
595 | trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); | |
596 | if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL) | |
597 | trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); | |
598 | trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( | |
599 | trie->wordcount+1, sizeof(reg_trie_wordinfo)); | |
600 | ||
601 | DEBUG_r({ | |
602 | trie_words = newAV(); | |
603 | }); | |
604 | ||
605 | re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD); | |
606 | assert(re_trie_maxbuff); | |
607 | if (!SvIOK(re_trie_maxbuff)) { | |
608 | sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); | |
609 | } | |
610 | DEBUG_TRIE_COMPILE_r({ | |
611 | Perl_re_indentf( aTHX_ | |
612 | "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", | |
613 | depth+1, | |
614 | REG_NODE_NUM(startbranch), REG_NODE_NUM(first), | |
615 | REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); | |
616 | }); | |
617 | ||
618 | /* Find the node we are going to overwrite */ | |
619 | if ( first == startbranch && OP( last ) != BRANCH ) { | |
620 | /* whole branch chain */ | |
621 | convert = first; | |
622 | } else { | |
623 | /* branch sub-chain */ | |
624 | convert = REGNODE_AFTER( first ); | |
625 | } | |
626 | ||
627 | /* -- First loop and Setup -- | |
628 | ||
629 | We first traverse the branches and scan each word to determine if it | |
630 | contains widechars, and how many unique chars there are, this is | |
631 | important as we have to build a table with at least as many columns as we | |
632 | have unique chars. | |
633 | ||
634 | We use an array of integers to represent the character codes 0..255 | |
635 | (trie->charmap) and we use a an HV* to store Unicode characters. We use | |
636 | the native representation of the character value as the key and IV's for | |
637 | the coded index. | |
638 | ||
639 | *TODO* If we keep track of how many times each character is used we can | |
640 | remap the columns so that the table compression later on is more | |
641 | efficient in terms of memory by ensuring the most common value is in the | |
642 | middle and the least common are on the outside. IMO this would be better | |
643 | than a most to least common mapping as theres a decent chance the most | |
644 | common letter will share a node with the least common, meaning the node | |
645 | will not be compressible. With a middle is most common approach the worst | |
646 | case is when we have the least common nodes twice. | |
647 | ||
648 | */ | |
649 | ||
650 | for ( cur = first ; cur < last ; cur = regnext( cur ) ) { | |
651 | regnode *noper = REGNODE_AFTER( cur ); | |
652 | const U8 *uc; | |
653 | const U8 *e; | |
654 | int foldlen = 0; | |
655 | U32 wordlen = 0; /* required init */ | |
656 | STRLEN minchars = 0; | |
657 | STRLEN maxchars = 0; | |
658 | bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the | |
659 | bitmap?*/ | |
acababb4 | 660 | lastbranch = cur; |
85900e28 YO |
661 | |
662 | if (OP(noper) == NOTHING) { | |
663 | /* skip past a NOTHING at the start of an alternation | |
664 | * eg, /(?:)a|(?:b)/ should be the same as /a|b/ | |
665 | * | |
666 | * If the next node is not something we are supposed to process | |
667 | * we will just ignore it due to the condition guarding the | |
668 | * next block. | |
669 | */ | |
670 | ||
671 | regnode *noper_next= regnext(noper); | |
672 | if (noper_next < tail) | |
673 | noper= noper_next; | |
674 | } | |
675 | ||
676 | if ( noper < tail | |
677 | && ( OP(noper) == flags | |
678 | || (flags == EXACT && OP(noper) == EXACT_REQ8) | |
679 | || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 | |
680 | || OP(noper) == EXACTFUP)))) | |
681 | { | |
682 | uc= (U8*)STRING(noper); | |
683 | e= uc + STR_LEN(noper); | |
684 | } else { | |
685 | trie->minlen= 0; | |
686 | continue; | |
687 | } | |
688 | ||
689 | ||
690 | if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ | |
691 | TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte | |
692 | regardless of encoding */ | |
693 | if (OP( noper ) == EXACTFUP) { | |
694 | /* false positives are ok, so just set this */ | |
695 | TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); | |
696 | } | |
697 | } | |
698 | ||
699 | for ( ; uc < e ; uc += len ) { /* Look at each char in the current | |
700 | branch */ | |
701 | TRIE_CHARCOUNT(trie)++; | |
702 | TRIE_READ_CHAR; | |
703 | ||
704 | /* TRIE_READ_CHAR returns the current character, or its fold if /i | |
705 | * is in effect. Under /i, this character can match itself, or | |
706 | * anything that folds to it. If not under /i, it can match just | |
707 | * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN | |
708 | * all fold to k, and all are single characters. But some folds | |
709 | * expand to more than one character, so for example LATIN SMALL | |
710 | * LIGATURE FFI folds to the three character sequence 'ffi'. If | |
711 | * the string beginning at 'uc' is 'ffi', it could be matched by | |
712 | * three characters, or just by the one ligature character. (It | |
713 | * could also be matched by two characters: LATIN SMALL LIGATURE FF | |
714 | * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). | |
715 | * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also | |
716 | * match.) The trie needs to know the minimum and maximum number | |
717 | * of characters that could match so that it can use size alone to | |
718 | * quickly reject many match attempts. The max is simple: it is | |
719 | * the number of folded characters in this branch (since a fold is | |
720 | * never shorter than what folds to it. */ | |
721 | ||
722 | maxchars++; | |
723 | ||
724 | /* And the min is equal to the max if not under /i (indicated by | |
725 | * 'folder' being NULL), or there are no multi-character folds. If | |
726 | * there is a multi-character fold, the min is incremented just | |
727 | * once, for the character that folds to the sequence. Each | |
728 | * character in the sequence needs to be added to the list below of | |
729 | * characters in the trie, but we count only the first towards the | |
730 | * min number of characters needed. This is done through the | |
731 | * variable 'foldlen', which is returned by the macros that look | |
732 | * for these sequences as the number of bytes the sequence | |
733 | * occupies. Each time through the loop, we decrement 'foldlen' by | |
734 | * how many bytes the current char occupies. Only when it reaches | |
735 | * 0 do we increment 'minchars' or look for another multi-character | |
736 | * sequence. */ | |
737 | if (folder == NULL) { | |
738 | minchars++; | |
739 | } | |
740 | else if (foldlen > 0) { | |
741 | foldlen -= (UTF) ? UTF8SKIP(uc) : 1; | |
742 | } | |
743 | else { | |
744 | minchars++; | |
745 | ||
746 | /* See if *uc is the beginning of a multi-character fold. If | |
747 | * so, we decrement the length remaining to look at, to account | |
748 | * for the current character this iteration. (We can use 'uc' | |
749 | * instead of the fold returned by TRIE_READ_CHAR because the | |
750 | * macro is smart enough to account for any unfolded | |
751 | * characters. */ | |
752 | if (UTF) { | |
753 | if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { | |
754 | foldlen -= UTF8SKIP(uc); | |
755 | } | |
756 | } | |
757 | else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { | |
758 | foldlen--; | |
759 | } | |
760 | } | |
761 | ||
762 | /* The current character (and any potential folds) should be added | |
763 | * to the possible matching characters for this position in this | |
764 | * branch */ | |
765 | if ( uvc < 256 ) { | |
766 | if ( folder ) { | |
767 | U8 folded= folder[ (U8) uvc ]; | |
768 | if ( !trie->charmap[ folded ] ) { | |
769 | trie->charmap[ folded ]=( ++trie->uniquecharcount ); | |
770 | TRIE_STORE_REVCHAR( folded ); | |
771 | } | |
772 | } | |
773 | if ( !trie->charmap[ uvc ] ) { | |
774 | trie->charmap[ uvc ]=( ++trie->uniquecharcount ); | |
775 | TRIE_STORE_REVCHAR( uvc ); | |
776 | } | |
777 | if ( set_bit ) { | |
778 | /* store the codepoint in the bitmap, and its folded | |
779 | * equivalent. */ | |
780 | TRIE_BITMAP_SET_FOLDED(trie, uvc, folder); | |
781 | set_bit = 0; /* We've done our bit :-) */ | |
782 | } | |
783 | } else { | |
784 | ||
785 | /* XXX We could come up with the list of code points that fold | |
786 | * to this using PL_utf8_foldclosures, except not for | |
787 | * multi-char folds, as there may be multiple combinations | |
788 | * there that could work, which needs to wait until runtime to | |
789 | * resolve (The comment about LIGATURE FFI above is such an | |
790 | * example */ | |
791 | ||
792 | SV** svpp; | |
793 | if ( !widecharmap ) | |
794 | widecharmap = newHV(); | |
795 | ||
796 | svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); | |
797 | ||
798 | if ( !svpp ) | |
799 | Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc ); | |
800 | ||
801 | if ( !SvTRUE( *svpp ) ) { | |
802 | sv_setiv( *svpp, ++trie->uniquecharcount ); | |
803 | TRIE_STORE_REVCHAR(uvc); | |
804 | } | |
805 | } | |
806 | } /* end loop through characters in this branch of the trie */ | |
807 | ||
808 | /* We take the min and max for this branch and combine to find the min | |
809 | * and max for all branches processed so far */ | |
810 | if( cur == first ) { | |
811 | trie->minlen = minchars; | |
812 | trie->maxlen = maxchars; | |
813 | } else if (minchars < trie->minlen) { | |
814 | trie->minlen = minchars; | |
815 | } else if (maxchars > trie->maxlen) { | |
816 | trie->maxlen = maxchars; | |
817 | } | |
818 | } /* end first pass */ | |
acababb4 | 819 | trie->before_paren = OP(first) == BRANCH |
17e3e02a YO |
820 | ? ARG1a(first) |
821 | : ARG2a(first); /* BRANCHJ */ | |
acababb4 YO |
822 | |
823 | trie->after_paren = OP(lastbranch) == BRANCH | |
17e3e02a YO |
824 | ? ARG1b(lastbranch) |
825 | : ARG2b(lastbranch); /* BRANCHJ */ | |
85900e28 YO |
826 | DEBUG_TRIE_COMPILE_r( |
827 | Perl_re_indentf( aTHX_ | |
828 | "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", | |
829 | depth+1, | |
830 | ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, | |
831 | (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, | |
832 | (int)trie->minlen, (int)trie->maxlen ) | |
833 | ); | |
834 | ||
835 | /* | |
836 | We now know what we are dealing with in terms of unique chars and | |
837 | string sizes so we can calculate how much memory a naive | |
838 | representation using a flat table will take. If it's over a reasonable | |
839 | limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory | |
840 | conservative but potentially much slower representation using an array | |
841 | of lists. | |
842 | ||
843 | At the end we convert both representations into the same compressed | |
844 | form that will be used in regexec.c for matching with. The latter | |
845 | is a form that cannot be used to construct with but has memory | |
846 | properties similar to the list form and access properties similar | |
847 | to the table form making it both suitable for fast searches and | |
848 | small enough that its feasable to store for the duration of a program. | |
849 | ||
850 | See the comment in the code where the compressed table is produced | |
851 | inplace from the flat tabe representation for an explanation of how | |
852 | the compression works. | |
853 | ||
854 | */ | |
855 | ||
856 | ||
857 | Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); | |
858 | prev_states[1] = 0; | |
859 | ||
860 | if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) | |
861 | > SvIV(re_trie_maxbuff) ) | |
862 | { | |
863 | /* | |
864 | Second Pass -- Array Of Lists Representation | |
865 | ||
866 | Each state will be represented by a list of charid:state records | |
867 | (reg_trie_trans_le) the first such element holds the CUR and LEN | |
868 | points of the allocated array. (See defines above). | |
869 | ||
870 | We build the initial structure using the lists, and then convert | |
871 | it into the compressed table form which allows faster lookups | |
872 | (but cant be modified once converted). | |
873 | */ | |
874 | ||
875 | STRLEN transcount = 1; | |
876 | ||
877 | DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n", | |
878 | depth+1)); | |
879 | ||
880 | trie->states = (reg_trie_state *) | |
881 | PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, | |
882 | sizeof(reg_trie_state) ); | |
883 | TRIE_LIST_NEW(1); | |
884 | next_alloc = 2; | |
885 | ||
886 | for ( cur = first ; cur < last ; cur = regnext( cur ) ) { | |
887 | ||
888 | regnode *noper = REGNODE_AFTER( cur ); | |
889 | U32 state = 1; /* required init */ | |
890 | U16 charid = 0; /* sanity init */ | |
891 | U32 wordlen = 0; /* required init */ | |
892 | ||
893 | if (OP(noper) == NOTHING) { | |
894 | regnode *noper_next= regnext(noper); | |
895 | if (noper_next < tail) | |
896 | noper= noper_next; | |
897 | /* we will undo this assignment if noper does not | |
898 | * point at a trieable type in the else clause of | |
899 | * the following statement. */ | |
900 | } | |
901 | ||
902 | if ( noper < tail | |
903 | && ( OP(noper) == flags | |
904 | || (flags == EXACT && OP(noper) == EXACT_REQ8) | |
905 | || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 | |
906 | || OP(noper) == EXACTFUP)))) | |
907 | { | |
908 | const U8 *uc= (U8*)STRING(noper); | |
909 | const U8 *e= uc + STR_LEN(noper); | |
910 | ||
911 | for ( ; uc < e ; uc += len ) { | |
912 | ||
913 | TRIE_READ_CHAR; | |
914 | ||
915 | if ( uvc < 256 ) { | |
916 | charid = trie->charmap[ uvc ]; | |
917 | } else { | |
918 | SV** const svpp = hv_fetch( widecharmap, | |
919 | (char*)&uvc, | |
920 | sizeof( UV ), | |
921 | 0); | |
922 | if ( !svpp ) { | |
923 | charid = 0; | |
924 | } else { | |
925 | charid=(U16)SvIV( *svpp ); | |
926 | } | |
927 | } | |
928 | /* charid is now 0 if we dont know the char read, or | |
929 | * nonzero if we do */ | |
930 | if ( charid ) { | |
931 | ||
932 | U16 check; | |
933 | U32 newstate = 0; | |
934 | ||
935 | charid--; | |
936 | if ( !trie->states[ state ].trans.list ) { | |
937 | TRIE_LIST_NEW( state ); | |
938 | } | |
939 | for ( check = 1; | |
940 | check <= TRIE_LIST_USED( state ); | |
941 | check++ ) | |
942 | { | |
943 | if ( TRIE_LIST_ITEM( state, check ).forid | |
944 | == charid ) | |
945 | { | |
946 | newstate = TRIE_LIST_ITEM( state, check ).newstate; | |
947 | break; | |
948 | } | |
949 | } | |
950 | if ( ! newstate ) { | |
951 | newstate = next_alloc++; | |
952 | prev_states[newstate] = state; | |
953 | TRIE_LIST_PUSH( state, charid, newstate ); | |
954 | transcount++; | |
955 | } | |
956 | state = newstate; | |
957 | } else { | |
958 | Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); | |
959 | } | |
960 | } | |
961 | } else { | |
962 | /* If we end up here it is because we skipped past a NOTHING, but did not end up | |
963 | * on a trieable type. So we need to reset noper back to point at the first regop | |
964 | * in the branch before we call TRIE_HANDLE_WORD() | |
965 | */ | |
966 | noper= REGNODE_AFTER(cur); | |
967 | } | |
968 | TRIE_HANDLE_WORD(state); | |
969 | ||
970 | } /* end second pass */ | |
971 | ||
972 | /* next alloc is the NEXT state to be allocated */ | |
973 | trie->statecount = next_alloc; | |
974 | trie->states = (reg_trie_state *) | |
975 | PerlMemShared_realloc( trie->states, | |
976 | next_alloc | |
977 | * sizeof(reg_trie_state) ); | |
978 | ||
979 | /* and now dump it out before we compress it */ | |
980 | DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, | |
981 | revcharmap, next_alloc, | |
982 | depth+1) | |
983 | ); | |
984 | ||
985 | trie->trans = (reg_trie_trans *) | |
986 | PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); | |
987 | { | |
988 | U32 state; | |
989 | U32 tp = 0; | |
990 | U32 zp = 0; | |
991 | ||
992 | ||
993 | for( state=1 ; state < next_alloc ; state ++ ) { | |
994 | U32 base=0; | |
995 | ||
996 | /* | |
997 | DEBUG_TRIE_COMPILE_MORE_r( | |
998 | Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp) | |
999 | ); | |
1000 | */ | |
1001 | ||
1002 | if (trie->states[state].trans.list) { | |
1003 | U16 minid=TRIE_LIST_ITEM( state, 1).forid; | |
1004 | U16 maxid=minid; | |
1005 | U16 idx; | |
1006 | ||
1007 | for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { | |
1008 | const U16 forid = TRIE_LIST_ITEM( state, idx).forid; | |
1009 | if ( forid < minid ) { | |
1010 | minid=forid; | |
1011 | } else if ( forid > maxid ) { | |
1012 | maxid=forid; | |
1013 | } | |
1014 | } | |
1015 | if ( transcount < tp + maxid - minid + 1) { | |
1016 | transcount *= 2; | |
1017 | trie->trans = (reg_trie_trans *) | |
1018 | PerlMemShared_realloc( trie->trans, | |
1019 | transcount | |
1020 | * sizeof(reg_trie_trans) ); | |
1021 | Zero( trie->trans + (transcount / 2), | |
1022 | transcount / 2, | |
1023 | reg_trie_trans ); | |
1024 | } | |
1025 | base = trie->uniquecharcount + tp - minid; | |
1026 | if ( maxid == minid ) { | |
1027 | U32 set = 0; | |
1028 | for ( ; zp < tp ; zp++ ) { | |
1029 | if ( ! trie->trans[ zp ].next ) { | |
1030 | base = trie->uniquecharcount + zp - minid; | |
1031 | trie->trans[ zp ].next = TRIE_LIST_ITEM( state, | |
1032 | 1).newstate; | |
1033 | trie->trans[ zp ].check = state; | |
1034 | set = 1; | |
1035 | break; | |
1036 | } | |
1037 | } | |
1038 | if ( !set ) { | |
1039 | trie->trans[ tp ].next = TRIE_LIST_ITEM( state, | |
1040 | 1).newstate; | |
1041 | trie->trans[ tp ].check = state; | |
1042 | tp++; | |
1043 | zp = tp; | |
1044 | } | |
1045 | } else { | |
1046 | for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { | |
1047 | const U32 tid = base | |
1048 | - trie->uniquecharcount | |
1049 | + TRIE_LIST_ITEM( state, idx ).forid; | |
1050 | trie->trans[ tid ].next = TRIE_LIST_ITEM( state, | |
1051 | idx ).newstate; | |
1052 | trie->trans[ tid ].check = state; | |
1053 | } | |
1054 | tp += ( maxid - minid + 1 ); | |
1055 | } | |
1056 | Safefree(trie->states[ state ].trans.list); | |
1057 | } | |
1058 | /* | |
1059 | DEBUG_TRIE_COMPILE_MORE_r( | |
1060 | Perl_re_printf( aTHX_ " base: %d\n",base); | |
1061 | ); | |
1062 | */ | |
1063 | trie->states[ state ].trans.base=base; | |
1064 | } | |
1065 | trie->lasttrans = tp + 1; | |
1066 | } | |
1067 | } else { | |
1068 | /* | |
1069 | Second Pass -- Flat Table Representation. | |
1070 | ||
1071 | we dont use the 0 slot of either trans[] or states[] so we add 1 to | |
1072 | each. We know that we will need Charcount+1 trans at most to store | |
1073 | the data (one row per char at worst case) So we preallocate both | |
1074 | structures assuming worst case. | |
1075 | ||
1076 | We then construct the trie using only the .next slots of the entry | |
1077 | structs. | |
1078 | ||
1079 | We use the .check field of the first entry of the node temporarily | |
1080 | to make compression both faster and easier by keeping track of how | |
1081 | many non zero fields are in the node. | |
1082 | ||
1083 | Since trans are numbered from 1 any 0 pointer in the table is a FAIL | |
1084 | transition. | |
1085 | ||
1086 | There are two terms at use here: state as a TRIE_NODEIDX() which is | |
1087 | a number representing the first entry of the node, and state as a | |
1088 | TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) | |
1089 | and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) | |
1090 | if there are 2 entrys per node. eg: | |
1091 | ||
1092 | A B A B | |
1093 | 1. 2 4 1. 3 7 | |
1094 | 2. 0 3 3. 0 5 | |
1095 | 3. 0 0 5. 0 0 | |
1096 | 4. 0 0 7. 0 0 | |
1097 | ||
1098 | The table is internally in the right hand, idx form. However as we | |
1099 | also have to deal with the states array which is indexed by nodenum | |
1100 | we have to use TRIE_NODENUM() to convert. | |
1101 | ||
1102 | */ | |
1103 | DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n", | |
1104 | depth+1)); | |
1105 | ||
1106 | trie->trans = (reg_trie_trans *) | |
1107 | PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) | |
1108 | * trie->uniquecharcount + 1, | |
1109 | sizeof(reg_trie_trans) ); | |
1110 | trie->states = (reg_trie_state *) | |
1111 | PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, | |
1112 | sizeof(reg_trie_state) ); | |
1113 | next_alloc = trie->uniquecharcount + 1; | |
1114 | ||
1115 | ||
1116 | for ( cur = first ; cur < last ; cur = regnext( cur ) ) { | |
1117 | ||
1118 | regnode *noper = REGNODE_AFTER( cur ); | |
1119 | ||
1120 | U32 state = 1; /* required init */ | |
1121 | ||
1122 | U16 charid = 0; /* sanity init */ | |
1123 | U32 accept_state = 0; /* sanity init */ | |
1124 | ||
1125 | U32 wordlen = 0; /* required init */ | |
1126 | ||
1127 | if (OP(noper) == NOTHING) { | |
1128 | regnode *noper_next= regnext(noper); | |
1129 | if (noper_next < tail) | |
1130 | noper= noper_next; | |
1131 | /* we will undo this assignment if noper does not | |
1132 | * point at a trieable type in the else clause of | |
1133 | * the following statement. */ | |
1134 | } | |
1135 | ||
1136 | if ( noper < tail | |
1137 | && ( OP(noper) == flags | |
1138 | || (flags == EXACT && OP(noper) == EXACT_REQ8) | |
1139 | || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 | |
1140 | || OP(noper) == EXACTFUP)))) | |
1141 | { | |
1142 | const U8 *uc= (U8*)STRING(noper); | |
1143 | const U8 *e= uc + STR_LEN(noper); | |
1144 | ||
1145 | for ( ; uc < e ; uc += len ) { | |
1146 | ||
1147 | TRIE_READ_CHAR; | |
1148 | ||
1149 | if ( uvc < 256 ) { | |
1150 | charid = trie->charmap[ uvc ]; | |
1151 | } else { | |
1152 | SV* const * const svpp = hv_fetch( widecharmap, | |
1153 | (char*)&uvc, | |
1154 | sizeof( UV ), | |
1155 | 0); | |
1156 | charid = svpp ? (U16)SvIV(*svpp) : 0; | |
1157 | } | |
1158 | if ( charid ) { | |
1159 | charid--; | |
1160 | if ( !trie->trans[ state + charid ].next ) { | |
1161 | trie->trans[ state + charid ].next = next_alloc; | |
1162 | trie->trans[ state ].check++; | |
1163 | prev_states[TRIE_NODENUM(next_alloc)] | |
1164 | = TRIE_NODENUM(state); | |
1165 | next_alloc += trie->uniquecharcount; | |
1166 | } | |
1167 | state = trie->trans[ state + charid ].next; | |
1168 | } else { | |
1169 | Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); | |
1170 | } | |
1171 | /* charid is now 0 if we dont know the char read, or | |
1172 | * nonzero if we do */ | |
1173 | } | |
1174 | } else { | |
1175 | /* If we end up here it is because we skipped past a NOTHING, but did not end up | |
1176 | * on a trieable type. So we need to reset noper back to point at the first regop | |
1177 | * in the branch before we call TRIE_HANDLE_WORD(). | |
1178 | */ | |
1179 | noper= REGNODE_AFTER(cur); | |
1180 | } | |
1181 | accept_state = TRIE_NODENUM( state ); | |
1182 | TRIE_HANDLE_WORD(accept_state); | |
1183 | ||
1184 | } /* end second pass */ | |
1185 | ||
1186 | /* and now dump it out before we compress it */ | |
1187 | DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, | |
1188 | revcharmap, | |
1189 | next_alloc, depth+1)); | |
1190 | ||
1191 | { | |
1192 | /* | |
1193 | * Inplace compress the table.* | |
1194 | ||
1195 | For sparse data sets the table constructed by the trie algorithm will | |
1196 | be mostly 0/FAIL transitions or to put it another way mostly empty. | |
1197 | (Note that leaf nodes will not contain any transitions.) | |
1198 | ||
1199 | This algorithm compresses the tables by eliminating most such | |
1200 | transitions, at the cost of a modest bit of extra work during lookup: | |
1201 | ||
1202 | - Each states[] entry contains a .base field which indicates the | |
1203 | index in the state[] array wheres its transition data is stored. | |
1204 | ||
1205 | - If .base is 0 there are no valid transitions from that node. | |
1206 | ||
1207 | - If .base is nonzero then charid is added to it to find an entry in | |
1208 | the trans array. | |
1209 | ||
1210 | -If trans[states[state].base+charid].check!=state then the | |
1211 | transition is taken to be a 0/Fail transition. Thus if there are fail | |
1212 | transitions at the front of the node then the .base offset will point | |
1213 | somewhere inside the previous nodes data (or maybe even into a node | |
1214 | even earlier), but the .check field determines if the transition is | |
1215 | valid. | |
1216 | ||
1217 | XXX - wrong maybe? | |
1218 | The following process inplace converts the table to the compressed | |
1219 | table: We first do not compress the root node 1,and mark all its | |
1220 | .check pointers as 1 and set its .base pointer as 1 as well. This | |
1221 | allows us to do a DFA construction from the compressed table later, | |
1222 | and ensures that any .base pointers we calculate later are greater | |
1223 | than 0. | |
1224 | ||
1225 | - We set 'pos' to indicate the first entry of the second node. | |
1226 | ||
1227 | - We then iterate over the columns of the node, finding the first and | |
1228 | last used entry at l and m. We then copy l..m into pos..(pos+m-l), | |
1229 | and set the .check pointers accordingly, and advance pos | |
1230 | appropriately and repreat for the next node. Note that when we copy | |
1231 | the next pointers we have to convert them from the original | |
1232 | NODEIDX form to NODENUM form as the former is not valid post | |
1233 | compression. | |
1234 | ||
1235 | - If a node has no transitions used we mark its base as 0 and do not | |
1236 | advance the pos pointer. | |
1237 | ||
1238 | - If a node only has one transition we use a second pointer into the | |
1239 | structure to fill in allocated fail transitions from other states. | |
1240 | This pointer is independent of the main pointer and scans forward | |
1241 | looking for null transitions that are allocated to a state. When it | |
1242 | finds one it writes the single transition into the "hole". If the | |
1243 | pointer doesnt find one the single transition is appended as normal. | |
1244 | ||
1245 | - Once compressed we can Renew/realloc the structures to release the | |
1246 | excess space. | |
1247 | ||
1248 | See "Table-Compression Methods" in sec 3.9 of the Red Dragon, | |
1249 | specifically Fig 3.47 and the associated pseudocode. | |
1250 | ||
1251 | demq | |
1252 | */ | |
1253 | const U32 laststate = TRIE_NODENUM( next_alloc ); | |
1254 | U32 state, charid; | |
1255 | U32 pos = 0, zp=0; | |
1256 | trie->statecount = laststate; | |
1257 | ||
1258 | for ( state = 1 ; state < laststate ; state++ ) { | |
1259 | U8 flag = 0; | |
1260 | const U32 stateidx = TRIE_NODEIDX( state ); | |
1261 | const U32 o_used = trie->trans[ stateidx ].check; | |
1262 | U32 used = trie->trans[ stateidx ].check; | |
1263 | trie->trans[ stateidx ].check = 0; | |
1264 | ||
1265 | for ( charid = 0; | |
1266 | used && charid < trie->uniquecharcount; | |
1267 | charid++ ) | |
1268 | { | |
1269 | if ( flag || trie->trans[ stateidx + charid ].next ) { | |
1270 | if ( trie->trans[ stateidx + charid ].next ) { | |
1271 | if (o_used == 1) { | |
1272 | for ( ; zp < pos ; zp++ ) { | |
1273 | if ( ! trie->trans[ zp ].next ) { | |
1274 | break; | |
1275 | } | |
1276 | } | |
1277 | trie->states[ state ].trans.base | |
1278 | = zp | |
1279 | + trie->uniquecharcount | |
1280 | - charid ; | |
1281 | trie->trans[ zp ].next | |
1282 | = SAFE_TRIE_NODENUM( trie->trans[ stateidx | |
1283 | + charid ].next ); | |
1284 | trie->trans[ zp ].check = state; | |
1285 | if ( ++zp > pos ) pos = zp; | |
1286 | break; | |
1287 | } | |
1288 | used--; | |
1289 | } | |
1290 | if ( !flag ) { | |
1291 | flag = 1; | |
1292 | trie->states[ state ].trans.base | |
1293 | = pos + trie->uniquecharcount - charid ; | |
1294 | } | |
1295 | trie->trans[ pos ].next | |
1296 | = SAFE_TRIE_NODENUM( | |
1297 | trie->trans[ stateidx + charid ].next ); | |
1298 | trie->trans[ pos ].check = state; | |
1299 | pos++; | |
1300 | } | |
1301 | } | |
1302 | } | |
1303 | trie->lasttrans = pos + 1; | |
1304 | trie->states = (reg_trie_state *) | |
1305 | PerlMemShared_realloc( trie->states, laststate | |
1306 | * sizeof(reg_trie_state) ); | |
1307 | DEBUG_TRIE_COMPILE_MORE_r( | |
1308 | Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n", | |
1309 | depth+1, | |
1310 | (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount | |
1311 | + 1 ), | |
1312 | (IV)next_alloc, | |
1313 | (IV)pos, | |
1314 | ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); | |
1315 | ); | |
1316 | ||
1317 | } /* end table compress */ | |
1318 | } | |
1319 | DEBUG_TRIE_COMPILE_MORE_r( | |
1320 | Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n", | |
1321 | depth+1, | |
1322 | (UV)trie->statecount, | |
1323 | (UV)trie->lasttrans) | |
1324 | ); | |
1325 | /* resize the trans array to remove unused space */ | |
1326 | trie->trans = (reg_trie_trans *) | |
1327 | PerlMemShared_realloc( trie->trans, trie->lasttrans | |
1328 | * sizeof(reg_trie_trans) ); | |
1329 | ||
1330 | { /* Modify the program and insert the new TRIE node */ | |
1331 | U8 nodetype =(U8) flags; | |
1332 | char *str=NULL; | |
1333 | ||
1334 | #ifdef DEBUGGING | |
1335 | regnode *optimize = NULL; | |
1336 | #endif /* DEBUGGING */ | |
3645ca4e YO |
1337 | /* make sure we have enough room to inject the TRIE op */ |
1338 | assert((!trie->jump) || !trie->jump[1] || | |
1339 | (trie->jump[1] >= (sizeof(tregnode_TRIE)/sizeof(struct regnode)))); | |
85900e28 YO |
1340 | /* |
1341 | This means we convert either the first branch or the first Exact, | |
1342 | depending on whether the thing following (in 'last') is a branch | |
1343 | or not and whther first is the startbranch (ie is it a sub part of | |
1344 | the alternation or is it the whole thing.) | |
1345 | Assuming its a sub part we convert the EXACT otherwise we convert | |
1346 | the whole branch sequence, including the first. | |
1347 | */ | |
1348 | /* Find the node we are going to overwrite */ | |
1349 | if ( first != startbranch || OP( last ) == BRANCH ) { | |
1350 | /* branch sub-chain */ | |
1351 | NEXT_OFF( first ) = (U16)(last - first); | |
1352 | /* whole branch chain */ | |
1353 | } | |
1354 | /* But first we check to see if there is a common prefix we can | |
1355 | split out as an EXACT and put in front of the TRIE node. */ | |
1356 | trie->startstate= 1; | |
1357 | if ( trie->bitmap && !widecharmap && !trie->jump ) { | |
1358 | /* we want to find the first state that has more than | |
1359 | * one transition, if that state is not the first state | |
1360 | * then we have a common prefix which we can remove. | |
1361 | */ | |
1362 | U32 state; | |
1363 | for ( state = 1 ; state < trie->statecount-1 ; state++ ) { | |
1364 | U32 ofs = 0; | |
1365 | I32 first_ofs = -1; /* keeps track of the ofs of the first | |
1366 | transition, -1 means none */ | |
1367 | U32 count = 0; | |
1368 | const U32 base = trie->states[ state ].trans.base; | |
1369 | ||
1370 | /* does this state terminate an alternation? */ | |
1371 | if ( trie->states[state].wordnum ) | |
1372 | count = 1; | |
1373 | ||
1374 | for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { | |
1375 | if ( ( base + ofs >= trie->uniquecharcount ) && | |
1376 | ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && | |
1377 | trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) | |
1378 | { | |
1379 | if ( ++count > 1 ) { | |
1380 | /* we have more than one transition */ | |
1381 | SV **tmp; | |
1382 | U8 *ch; | |
1383 | /* if this is the first state there is no common prefix | |
1384 | * to extract, so we can exit */ | |
1385 | if ( state == 1 ) break; | |
1386 | tmp = av_fetch_simple( revcharmap, ofs, 0); | |
1387 | ch = (U8*)SvPV_nolen_const( *tmp ); | |
1388 | ||
1389 | /* if we are on count 2 then we need to initialize the | |
1390 | * bitmap, and store the previous char if there was one | |
1391 | * in it*/ | |
1392 | if ( count == 2 ) { | |
1393 | /* clear the bitmap */ | |
1394 | Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); | |
1395 | DEBUG_OPTIMISE_r( | |
1396 | Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [", | |
1397 | depth+1, | |
1398 | (UV)state)); | |
1399 | if (first_ofs >= 0) { | |
1400 | SV ** const tmp = av_fetch_simple( revcharmap, first_ofs, 0); | |
1401 | const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); | |
1402 | ||
1403 | TRIE_BITMAP_SET_FOLDED(trie,*ch, folder); | |
1404 | DEBUG_OPTIMISE_r( | |
1405 | Perl_re_printf( aTHX_ "%s", (char*)ch) | |
1406 | ); | |
1407 | } | |
1408 | } | |
1409 | /* store the current firstchar in the bitmap */ | |
1410 | TRIE_BITMAP_SET_FOLDED(trie,*ch, folder); | |
1411 | DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); | |
1412 | } | |
1413 | first_ofs = ofs; | |
1414 | } | |
1415 | } | |
1416 | if ( count == 1 ) { | |
1417 | /* This state has only one transition, its transition is part | |
1418 | * of a common prefix - we need to concatenate the char it | |
1419 | * represents to what we have so far. */ | |
1420 | SV **tmp = av_fetch_simple( revcharmap, first_ofs, 0); | |
1421 | STRLEN len; | |
1422 | char *ch = SvPV( *tmp, len ); | |
1423 | DEBUG_OPTIMISE_r({ | |
1424 | SV *sv=sv_newmortal(); | |
1425 | Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n", | |
1426 | depth+1, | |
1427 | (UV)state, (UV)first_ofs, | |
1428 | pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, | |
1429 | PL_colors[0], PL_colors[1], | |
1430 | (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | | |
1431 | PERL_PV_ESCAPE_FIRSTCHAR | |
1432 | ) | |
1433 | ); | |
1434 | }); | |
1435 | if ( state==1 ) { | |
1436 | OP( convert ) = nodetype; | |
1437 | str=STRING(convert); | |
1438 | setSTR_LEN(convert, 0); | |
1439 | } | |
1440 | assert( ( STR_LEN(convert) + len ) < 256 ); | |
1441 | setSTR_LEN(convert, (U8)(STR_LEN(convert) + len)); | |
1442 | while (len--) | |
1443 | *str++ = *ch++; | |
1444 | } else { | |
1445 | #ifdef DEBUGGING | |
1446 | if (state>1) | |
1447 | DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n")); | |
1448 | #endif | |
1449 | break; | |
1450 | } | |
1451 | } | |
1452 | trie->prefixlen = (state-1); | |
1453 | if (str) { | |
1454 | regnode *n = REGNODE_AFTER(convert); | |
1455 | assert( n - convert <= U16_MAX ); | |
1456 | NEXT_OFF(convert) = n - convert; | |
1457 | trie->startstate = state; | |
1458 | trie->minlen -= (state - 1); | |
1459 | trie->maxlen -= (state - 1); | |
1460 | #ifdef DEBUGGING | |
1461 | /* At least the UNICOS C compiler choked on this | |
1462 | * being argument to DEBUG_r(), so let's just have | |
1463 | * it right here. */ | |
1464 | if ( | |
1465 | #ifdef PERL_EXT_RE_BUILD | |
1466 | 1 | |
1467 | #else | |
1468 | DEBUG_r_TEST | |
1469 | #endif | |
1470 | ) { | |
1471 | U32 word = trie->wordcount; | |
1472 | while (word--) { | |
1473 | SV ** const tmp = av_fetch_simple( trie_words, word, 0 ); | |
1474 | if (tmp) { | |
1475 | if ( STR_LEN(convert) <= SvCUR(*tmp) ) | |
1476 | sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); | |
1477 | else | |
1478 | sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); | |
1479 | } | |
1480 | } | |
1481 | } | |
1482 | #endif | |
1483 | if (trie->maxlen) { | |
1484 | convert = n; | |
1485 | } else { | |
1486 | NEXT_OFF(convert) = (U16)(tail - convert); | |
1487 | DEBUG_r(optimize= n); | |
1488 | } | |
1489 | } | |
1490 | } | |
1491 | if (!jumper) | |
1492 | jumper = last; | |
1493 | if ( trie->maxlen ) { | |
1494 | NEXT_OFF( convert ) = (U16)(tail - convert); | |
17e3e02a | 1495 | ARG1u_SET( convert, data_slot ); |
85900e28 YO |
1496 | /* Store the offset to the first unabsorbed branch in |
1497 | jump[0], which is otherwise unused by the jump logic. | |
1498 | We use this when dumping a trie and during optimisation. */ | |
1499 | if (trie->jump) | |
1500 | trie->jump[0] = (U16)(nextbranch - convert); | |
1501 | ||
1502 | /* If the start state is not accepting (meaning there is no empty string/NOTHING) | |
1503 | * and there is a bitmap | |
1504 | * and the first "jump target" node we found leaves enough room | |
1505 | * then convert the TRIE node into a TRIEC node, with the bitmap | |
1506 | * embedded inline in the opcode - this is hypothetically faster. | |
1507 | */ | |
1508 | if ( !trie->states[trie->startstate].wordnum | |
1509 | && trie->bitmap | |
3645ca4e | 1510 | && ( (char *)jumper - (char *)convert) >= (int)sizeof(tregnode_TRIEC) ) |
85900e28 YO |
1511 | { |
1512 | OP( convert ) = TRIEC; | |
3645ca4e | 1513 | Copy(trie->bitmap, ((tregnode_TRIEC *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); |
85900e28 YO |
1514 | PerlMemShared_free(trie->bitmap); |
1515 | trie->bitmap= NULL; | |
1516 | } else | |
1517 | OP( convert ) = TRIE; | |
1518 | ||
1519 | /* store the type in the flags */ | |
44eb4cdc | 1520 | FLAGS(convert) = nodetype; |
85900e28 YO |
1521 | DEBUG_r({ |
1522 | optimize = convert | |
1523 | + NODE_STEP_REGNODE | |
1524 | + REGNODE_ARG_LEN( OP( convert ) ); | |
1525 | }); | |
1526 | /* XXX We really should free up the resource in trie now, | |
1527 | as we won't use them - (which resources?) dmq */ | |
1528 | } | |
1529 | /* needed for dumping*/ | |
1530 | DEBUG_r(if (optimize) { | |
1531 | /* | |
1532 | Try to clean up some of the debris left after the | |
1533 | optimisation. | |
1534 | */ | |
1535 | while( optimize < jumper ) { | |
1536 | OP( optimize ) = OPTIMIZED; | |
1537 | optimize++; | |
1538 | } | |
1539 | }); | |
1540 | } /* end node insert */ | |
1541 | ||
1542 | /* Finish populating the prev field of the wordinfo array. Walk back | |
1543 | * from each accept state until we find another accept state, and if | |
1544 | * so, point the first word's .prev field at the second word. If the | |
1545 | * second already has a .prev field set, stop now. This will be the | |
1546 | * case either if we've already processed that word's accept state, | |
1547 | * or that state had multiple words, and the overspill words were | |
1548 | * already linked up earlier. | |
1549 | */ | |
1550 | { | |
1551 | U16 word; | |
1552 | U32 state; | |
1553 | U16 prev; | |
1554 | ||
1555 | for (word=1; word <= trie->wordcount; word++) { | |
1556 | prev = 0; | |
1557 | if (trie->wordinfo[word].prev) | |
1558 | continue; | |
1559 | state = trie->wordinfo[word].accept; | |
1560 | while (state) { | |
1561 | state = prev_states[state]; | |
1562 | if (!state) | |
1563 | break; | |
1564 | prev = trie->states[state].wordnum; | |
1565 | if (prev) | |
1566 | break; | |
1567 | } | |
1568 | trie->wordinfo[word].prev = prev; | |
1569 | } | |
1570 | Safefree(prev_states); | |
1571 | } | |
1572 | ||
1573 | ||
1574 | /* and now dump out the compressed format */ | |
1575 | DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); | |
1576 | ||
1577 | RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; | |
1578 | #ifdef DEBUGGING | |
1579 | RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; | |
1580 | RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; | |
1581 | #else | |
1582 | SvREFCNT_dec_NN(revcharmap); | |
1583 | #endif | |
1584 | return trie->jump | |
1585 | ? MADE_JUMP_TRIE | |
1586 | : trie->startstate>1 | |
1587 | ? MADE_EXACT_TRIE | |
1588 | : MADE_TRIE; | |
1589 | } | |
1590 | ||
1591 | regnode * | |
1592 | Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) | |
1593 | { | |
1594 | /* The Trie is constructed and compressed now so we can build a fail array if | |
1595 | * it's needed | |
1596 | ||
1597 | This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and | |
1598 | 3.32 in the | |
1599 | "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, | |
1600 | Ullman 1985/88 | |
1601 | ISBN 0-201-10088-6 | |
1602 | ||
1603 | We find the fail state for each state in the trie, this state is the longest | |
1604 | proper suffix of the current state's 'word' that is also a proper prefix of | |
1605 | another word in our trie. State 1 represents the word '' and is thus the | |
1606 | default fail state. This allows the DFA not to have to restart after its | |
1607 | tried and failed a word at a given point, it simply continues as though it | |
1608 | had been matching the other word in the first place. | |
1609 | Consider | |
1610 | 'abcdgu'=~/abcdefg|cdgu/ | |
1611 | When we get to 'd' we are still matching the first word, we would encounter | |
1612 | 'g' which would fail, which would bring us to the state representing 'd' in | |
1613 | the second word where we would try 'g' and succeed, proceeding to match | |
1614 | 'cdgu'. | |
1615 | */ | |
1616 | /* add a fail transition */ | |
17e3e02a | 1617 | const U32 trie_offset = ARG1u(source); |
85900e28 YO |
1618 | reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; |
1619 | U32 *q; | |
1620 | const U32 ucharcount = trie->uniquecharcount; | |
1621 | const U32 numstates = trie->statecount; | |
1622 | const U32 ubound = trie->lasttrans + ucharcount; | |
1623 | U32 q_read = 0; | |
1624 | U32 q_write = 0; | |
1625 | U32 charid; | |
1626 | U32 base = trie->states[ 1 ].trans.base; | |
1627 | U32 *fail; | |
1628 | reg_ac_data *aho; | |
1629 | const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("T")); | |
1630 | regnode *stclass; | |
1631 | DECLARE_AND_GET_RE_DEBUG_FLAGS; | |
1632 | ||
1633 | PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; | |
1634 | PERL_UNUSED_CONTEXT; | |
1635 | #ifndef DEBUGGING | |
1636 | PERL_UNUSED_ARG(depth); | |
1637 | #endif | |
1638 | ||
1639 | if ( OP(source) == TRIE ) { | |
3645ca4e YO |
1640 | tregnode_TRIE *op = (tregnode_TRIE *) |
1641 | PerlMemShared_calloc(1, sizeof(tregnode_TRIE)); | |
1642 | StructCopy(source, op, tregnode_TRIE); | |
85900e28 YO |
1643 | stclass = (regnode *)op; |
1644 | } else { | |
3645ca4e YO |
1645 | tregnode_TRIEC *op = (tregnode_TRIEC *) |
1646 | PerlMemShared_calloc(1, sizeof(tregnode_TRIEC)); | |
1647 | StructCopy(source, op, tregnode_TRIEC); | |
85900e28 YO |
1648 | stclass = (regnode *)op; |
1649 | } | |
1650 | OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */ | |
1651 | ||
17e3e02a | 1652 | ARG1u_SET( stclass, data_slot ); |
85900e28 YO |
1653 | aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); |
1654 | RExC_rxi->data->data[ data_slot ] = (void*)aho; | |
1655 | aho->trie=trie_offset; | |
1656 | aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); | |
1657 | Copy( trie->states, aho->states, numstates, reg_trie_state ); | |
1658 | Newx( q, numstates, U32); | |
1659 | aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); | |
1660 | aho->refcount = 1; | |
1661 | fail = aho->fail; | |
1662 | /* initialize fail[0..1] to be 1 so that we always have | |
1663 | a valid final fail state */ | |
1664 | fail[ 0 ] = fail[ 1 ] = 1; | |
1665 | ||
1666 | for ( charid = 0; charid < ucharcount ; charid++ ) { | |
1667 | const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); | |
1668 | if ( newstate ) { | |
1669 | q[ q_write ] = newstate; | |
1670 | /* set to point at the root */ | |
1671 | fail[ q[ q_write++ ] ]=1; | |
1672 | } | |
1673 | } | |
1674 | while ( q_read < q_write) { | |
1675 | const U32 cur = q[ q_read++ % numstates ]; | |
1676 | base = trie->states[ cur ].trans.base; | |
1677 | ||
1678 | for ( charid = 0 ; charid < ucharcount ; charid++ ) { | |
1679 | const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); | |
1680 | if (ch_state) { | |
1681 | U32 fail_state = cur; | |
1682 | U32 fail_base; | |
1683 | do { | |
1684 | fail_state = fail[ fail_state ]; | |
1685 | fail_base = aho->states[ fail_state ].trans.base; | |
1686 | } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); | |
1687 | ||
1688 | fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); | |
1689 | fail[ ch_state ] = fail_state; | |
1690 | if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) | |
1691 | { | |
1692 | aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; | |
1693 | } | |
1694 | q[ q_write++ % numstates] = ch_state; | |
1695 | } | |
1696 | } | |
1697 | } | |
1698 | /* restore fail[0..1] to 0 so that we "fall out" of the AC loop | |
1699 | when we fail in state 1, this allows us to use the | |
1700 | charclass scan to find a valid start char. This is based on the principle | |
1701 | that theres a good chance the string being searched contains lots of stuff | |
1702 | that cant be a start char. | |
1703 | */ | |
1704 | fail[ 0 ] = fail[ 1 ] = 0; | |
1705 | DEBUG_TRIE_COMPILE_r({ | |
1706 | Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0", | |
1707 | depth, (UV)numstates | |
1708 | ); | |
1709 | for( q_read=1; q_read<numstates; q_read++ ) { | |
1710 | Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]); | |
1711 | } | |
1712 | Perl_re_printf( aTHX_ "\n"); | |
1713 | }); | |
1714 | Safefree(q); | |
1715 | /*RExC_seen |= REG_TRIEDFA_SEEN;*/ | |
1716 | return stclass; | |
1717 | } |