perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	#ifdef PERL_EXT_RE_BUILD
	2	#include "re_top.h"
	3	#endif
	4
	5	#include "EXTERN.h"
	6	#define PERL_IN_REGEX_ENGINE
	7	#define PERL_IN_REGCOMP_ANY
	8	#define PERL_IN_REGCOMP_TRIE_C
	9	#include "perl.h"
	10
	11	#ifdef PERL_IN_XSUB_RE
	12	# include "re_comp.h"
	13	#else
	14	# include "regcomp.h"
	15	#endif
	16
	17	#include "invlist_inline.h"
	18	#include "unicode_constants.h"
	19	#include "regcomp_internal.h"
	20
	21	#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
	22	#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
	23	#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
	24	#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
	25	? (TRIE_LIST_CUR( idx ) - 1) \
	26	: 0 )
	27
	28
	29	#ifdef DEBUGGING
	30	/*
	31	dump_trie(trie,widecharmap,revcharmap)
	32	dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
	33	dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
	34
	35	These routines dump out a trie in a somewhat readable format.
	36	The _interim_ variants are used for debugging the interim
	37	tables that are used to generate the final compressed
	38	representation which is what dump_trie expects.
	39
	40	Part of the reason for their existence is to provide a form
	41	of documentation as to how the different representations function.
	42
	43	*/
	44
	45	/*
	46	Dumps the final compressed table form of the trie to Perl_debug_log.
	47	Used for debugging make_trie().
	48	*/
	49
	50	STATIC void
	51	S_dump_trie(pTHX_ const struct _reg_trie_data trie, HV widecharmap,
	52	AV *revcharmap, U32 depth)
	53	{
	54	U32 state;
	55	SV *sv=sv_newmortal();
	56	int colwidth= widecharmap ? 6 : 4;
	57	U16 word;
	58	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	59
	60	PERL_ARGS_ASSERT_DUMP_TRIE;
	61
	62	Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
	63	depth+1, "Match","Base","Ofs" );
	64
	65	for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
	66	SV ** const tmp = av_fetch_simple( revcharmap, state, 0);
	67	if ( tmp ) {
	68	Perl_re_printf( aTHX_ "%*s",
	69	colwidth,
	70	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	71	PL_colors[0], PL_colors[1],
	72	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	73	PERL_PV_ESCAPE_FIRSTCHAR
	74	)
	75	);
	76	}
	77	}
	78	Perl_re_printf( aTHX_ "\n");
	79	Perl_re_indentf( aTHX_ "State\|-----------------------", depth+1);
	80
	81	for( state = 0 ; state < trie->uniquecharcount ; state++ )
	82	Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
	83	Perl_re_printf( aTHX_ "\n");
	84
	85	for( state = 1 ; state < trie->statecount ; state++ ) {
	86	const U32 base = trie->states[ state ].trans.base;
	87
	88	Perl_re_indentf( aTHX_ "#%4" UVXf "\|", depth+1, (UV)state);
	89
	90	if ( trie->states[ state ].wordnum ) {
	91	Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
	92	} else {
	93	Perl_re_printf( aTHX_ "%6s", "" );
	94	}
	95
	96	Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
	97
	98	if ( base ) {
	99	U32 ofs = 0;
	100
	101	while( ( base + ofs < trie->uniquecharcount ) \|\|
	102	( base + ofs - trie->uniquecharcount < trie->lasttrans
	103	&& trie->trans[ base + ofs - trie->uniquecharcount ].check
	104	!= state))
	105	ofs++;
	106
	107	Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
	108
	109	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	110	if ( ( base + ofs >= trie->uniquecharcount )
	111	&& ( base + ofs - trie->uniquecharcount
	112	< trie->lasttrans )
	113	&& trie->trans[ base + ofs
	114	- trie->uniquecharcount ].check == state )
	115	{
	116	Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
	117	(UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
	118	);
	119	} else {
	120	Perl_re_printf( aTHX_ "%*s", colwidth," ." );
	121	}
	122	}
	123
	124	Perl_re_printf( aTHX_ "]");
	125
	126	}
	127	Perl_re_printf( aTHX_ "\n" );
	128	}
	129	Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
	130	depth);
	131	for (word=1; word <= trie->wordcount; word++) {
	132	Perl_re_printf( aTHX_ " %d:(%d,%d)",
	133	(int)word, (int)(trie->wordinfo[word].prev),
	134	(int)(trie->wordinfo[word].len));
	135	}
	136	Perl_re_printf( aTHX_ "\n" );
	137	}
	138	/*
	139	Dumps a fully constructed but uncompressed trie in list form.
	140	List tries normally only are used for construction when the number of
	141	possible chars (trie->uniquecharcount) is very high.
	142	Used for debugging make_trie().
	143	*/
	144	STATIC void
	145	S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
	146	HV widecharmap, AV revcharmap, U32 next_alloc,
	147	U32 depth)
	148	{
	149	U32 state;
	150	SV *sv=sv_newmortal();
	151	int colwidth= widecharmap ? 6 : 4;
	152	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	153
	154	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
	155
	156	/* print out the table precompression. */
	157	Perl_re_indentf( aTHX_ "State :Word \| Transition Data\n",
	158	depth+1 );
	159	Perl_re_indentf( aTHX_ "%s",
	160	depth+1, "------:-----+-----------------\n" );
	161
	162	for( state=1 ; state < next_alloc ; state ++ ) {
	163	U16 charid;
	164
	165	Perl_re_indentf( aTHX_ " %4" UVXf " :",
	166	depth+1, (UV)state );
	167	if ( ! trie->states[ state ].wordnum ) {
	168	Perl_re_printf( aTHX_ "%5s\| ","");
	169	} else {
	170	Perl_re_printf( aTHX_ "W%4x\| ",
	171	trie->states[ state ].wordnum
	172	);
	173	}
	174	for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
	175	SV ** const tmp = av_fetch_simple( revcharmap,
	176	TRIE_LIST_ITEM(state, charid).forid, 0);
	177	if ( tmp ) {
	178	Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " \| ",
	179	colwidth,
	180	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp),
	181	colwidth,
	182	PL_colors[0], PL_colors[1],
	183	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
	184	\| PERL_PV_ESCAPE_FIRSTCHAR
	185	) ,
	186	TRIE_LIST_ITEM(state, charid).forid,
	187	(UV)TRIE_LIST_ITEM(state, charid).newstate
	188	);
	189	if (!(charid % 10))
	190	Perl_re_printf( aTHX_ "\n%*s\| ",
	191	(int)((depth * 2) + 14), "");
	192	}
	193	}
	194	Perl_re_printf( aTHX_ "\n");
	195	}
	196	}
	197
	198	/*
	199	Dumps a fully constructed but uncompressed trie in table form.
	200	This is the normal DFA style state transition table, with a few
	201	twists to facilitate compression later.
	202	Used for debugging make_trie().
	203	*/
	204	STATIC void
	205	S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
	206	HV widecharmap, AV revcharmap, U32 next_alloc,
	207	U32 depth)
	208	{
	209	U32 state;
	210	U16 charid;
	211	SV *sv=sv_newmortal();
	212	int colwidth= widecharmap ? 6 : 4;
	213	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	214
	215	PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
	216
	217	/*
	218	print out the table precompression so that we can do a visual check
	219	that they are identical.
	220	*/
	221
	222	Perl_re_indentf( aTHX_ "Char : ", depth+1 );
	223
	224	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	225	SV ** const tmp = av_fetch_simple( revcharmap, charid, 0);
	226	if ( tmp ) {
	227	Perl_re_printf( aTHX_ "%*s",
	228	colwidth,
	229	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), colwidth,
	230	PL_colors[0], PL_colors[1],
	231	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	232	PERL_PV_ESCAPE_FIRSTCHAR
	233	)
	234	);
	235	}
	236	}
	237
	238	Perl_re_printf( aTHX_ "\n");
	239	Perl_re_indentf( aTHX_ "State+-", depth+1 );
	240
	241	for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
	242	Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
	243	}
	244
	245	Perl_re_printf( aTHX_ "\n" );
	246
	247	for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
	248
	249	Perl_re_indentf( aTHX_ "%4" UVXf " : ",
	250	depth+1,
	251	(UV)TRIE_NODENUM( state ) );
	252
	253	for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
	254	UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
	255	if (v)
	256	Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
	257	else
	258	Perl_re_printf( aTHX_ "%*s", colwidth, "." );
	259	}
	260	if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
	261	Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
	262	(UV)trie->trans[ state ].check );
	263	} else {
	264	Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
	265	(UV)trie->trans[ state ].check,
	266	trie->states[ TRIE_NODENUM( state ) ].wordnum );
	267	}
	268	}
	269	}
	270
	271	#endif
	272
	273
	274	/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
	275	startbranch: the first branch in the whole branch sequence
	276	first : start branch of sequence of branch-exact nodes.
	277	May be the same as startbranch
	278	last : Thing following the last branch.
	279	May be the same as tail.
	280	tail : item following the branch sequence
	281	count : words in the sequence
	282	flags : currently the OP() type we will be building one of /EXACT(\|F\|FA\|FU\|FU_SS\|L\|FLU8)/
	283	depth : indent depth
	284
	285	Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
	286
	287	A trie is an N'ary tree where the branches are determined by digital
	288	decomposition of the key. IE, at the root node you look up the 1st character and
	289	follow that branch repeat until you find the end of the branches. Nodes can be
	290	marked as "accepting" meaning they represent a complete word. Eg:
	291
	292	/he\|she\|his\|hers/
	293
	294	would convert into the following structure. Numbers represent states, letters
	295	following numbers represent valid transitions on the letter from that state, if
	296	the number is in square brackets it represents an accepting state, otherwise it
	297	will be in parenthesis.
	298
	299	+-h->+-e->[3]-+-r->(8)-+-s->[9]
	300	\| \|
	301	\| (2)
	302	\| \|
	303	(1) +-i->(6)-+-s->[7]
	304	\|
	305	+-s->(3)-+-h->(4)-+-e->[5]
	306
	307	Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
	308
	309	This shows that when matching against the string 'hers' we will begin at state 1
	310	read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
	311	then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
	312	is also accepting. Thus we know that we can match both 'he' and 'hers' with a
	313	single traverse. We store a mapping from accepting to state to which word was
	314	matched, and then when we have multiple possibilities we try to complete the
	315	rest of the regex in the order in which they occurred in the alternation.
	316
	317	The only prior NFA like behaviour that would be changed by the TRIE support is
	318	the silent ignoring of duplicate alternations which are of the form:
	319
	320	/ (DUPE\|DUPE) X? (?{ ... }) Y /x
	321
	322	Thus EVAL blocks following a trie may be called a different number of times with
	323	and without the optimisation. With the optimisations dupes will be silently
	324	ignored. This inconsistent behaviour of EVAL type nodes is well established as
	325	the following demonstrates:
	326
	327	'words'=~/(word\|word\|word)(?{ print $1 })[xyz]/
	328
	329	which prints out 'word' three times, but
	330
	331	'words'=~/(word\|word\|word)(?{ print $1 })S/
	332
	333	which doesnt print it out at all. This is due to other optimisations kicking in.
	334
	335	Example of what happens on a structural level:
	336
	337	The regexp /(ac\|ad\|ab)+/ will produce the following debug output:
	338
	339	1: CURLYM[1] {1,32767}(18)
	340	5: BRANCH(8)
	341	6: EXACT <ac>(16)
	342	8: BRANCH(11)
	343	9: EXACT <ad>(16)
	344	11: BRANCH(14)
	345	12: EXACT <ab>(16)
	346	16: SUCCEED(0)
	347	17: NOTHING(18)
	348	18: END(0)
	349
	350	This would be optimizable with startbranch=5, first=5, last=16, tail=16
	351	and should turn into:
	352
	353	1: CURLYM[1] {1,32767}(18)
	354	5: TRIE(16)
	355	[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
	356	<ac>
	357	<ad>
	358	<ab>
	359	16: SUCCEED(0)
	360	17: NOTHING(18)
	361	18: END(0)
	362
	363	Cases where tail != last would be like /(?foo\|bar)baz/:
	364
	365	1: BRANCH(4)
	366	2: EXACT <foo>(8)
	367	4: BRANCH(7)
	368	5: EXACT <bar>(8)
	369	7: TAIL(8)
	370	8: EXACT <baz>(10)
	371	10: END(0)
	372
	373	which would be optimizable with startbranch=1, first=1, last=7, tail=8
	374	and would end up looking like:
	375
	376	1: TRIE(8)
	377	[Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
	378	<foo>
	379	<bar>
	380	7: TAIL(8)
	381	8: EXACT <baz>(10)
	382	10: END(0)
	383
	384	d = uvchr_to_utf8_flags(d, uv, 0);
	385
	386	is the recommended Unicode-aware way of saying
	387
	388	*(d++) = uv;
	389	*/
	390
	391	#define TRIE_STORE_REVCHAR(val) \
	392	STMT_START { \
	393	if (UTF) { \
	394	SV *zlopp = newSV(UTF8_MAXBYTES); \
	395	unsigned char flrbbbbb = (unsigned char ) SvPVX(zlopp); \
	396	unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
	397	*kapow = '\0'; \
	398	SvCUR_set(zlopp, kapow - flrbbbbb); \
	399	SvPOK_on(zlopp); \
	400	SvUTF8_on(zlopp); \
	401	av_push_simple(revcharmap, zlopp); \
	402	} else { \
	403	char ooooff = (char)val; \
	404	av_push_simple(revcharmap, newSVpvn(&ooooff, 1)); \
	405	} \
	406	} STMT_END
	407
	408	/* This gets the next character from the input, folding it if not already
	409	* folded. */
	410	#define TRIE_READ_CHAR STMT_START { \
	411	wordlen++; \
	412	if ( UTF ) { \
	413	/* if it is UTF then it is either already folded, or does not need \
	414	* folding */ \
	415	uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
	416	} \
	417	else if (folder == PL_fold_latin1) { \
	418	/* This folder implies Unicode rules, which in the range expressible \
	419	* by not UTF is the lower case, with the two exceptions, one of \
	420	* which should have been taken care of before calling this */ \
	421	assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
	422	uvc = toLOWER_L1(*uc); \
	423	if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
	424	len = 1; \
	425	} else { \
	426	/* raw data, will be folded later if needed */ \
	427	uvc = (U32)*uc; \
	428	len = 1; \
	429	} \
	430	} STMT_END
	431
	432
	433
	434	#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
	435	if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
	436	U32 ging = TRIE_LIST_LEN( state ) * 2; \
	437	Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
	438	TRIE_LIST_LEN( state ) = ging; \
	439	} \
	440	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
	441	TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
	442	TRIE_LIST_CUR( state )++; \
	443	} STMT_END
	444
	445	#define TRIE_LIST_NEW(state) STMT_START { \
	446	Newx( trie->states[ state ].trans.list, \
	447	4, reg_trie_trans_le ); \
	448	TRIE_LIST_CUR( state ) = 1; \
	449	TRIE_LIST_LEN( state ) = 4; \
	450	} STMT_END
	451
	452	#define TRIE_HANDLE_WORD(state) STMT_START { \
	453	U16 dupe= trie->states[ state ].wordnum; \
	454	regnode * const noper_next = regnext( noper ); \
	455	\
	456	DEBUG_r({ \
	457	/* store the word for dumping */ \
	458	SV* tmp; \
	459	if (OP(noper) != NOTHING) \
	460	tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
	461	else \
	462	tmp = newSVpvn_utf8( "", 0, UTF ); \
	463	av_push_simple( trie_words, tmp ); \
	464	}); \
	465	\
	466	curword++; \
	467	trie->wordinfo[curword].prev = 0; \
	468	trie->wordinfo[curword].len = wordlen; \
	469	trie->wordinfo[curword].accept = state; \
	470	\
	471	if ( noper_next < tail ) { \
	472	if (!trie->jump) { \
	473	trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
	474	sizeof(U16) ); \
	475	trie->j_before_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
	476	sizeof(U16) ); \
	477	trie->j_after_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
	478	sizeof(U16) ); \
	479	} \
	480	trie->jump[curword] = (U16)(noper_next - convert); \
	481	U16 set_before_paren; \
	482	U16 set_after_paren; \
	483	if (OP(cur) == BRANCH) { \
	484	set_before_paren = ARG1a(cur); \
	485	set_after_paren = ARG1b(cur); \
	486	} else { \
	487	set_before_paren = ARG2a(cur); \
	488	set_after_paren = ARG2b(cur); \
	489	} \
	490	trie->j_before_paren[curword] = set_before_paren; \
	491	trie->j_after_paren[curword] = set_after_paren; \
	492	if (!jumper) \
	493	jumper = noper_next; \
	494	if (!nextbranch) \
	495	nextbranch= regnext(cur); \
	496	} \
	497	\
	498	if ( dupe ) { \
	499	/* It's a dupe. Pre-insert into the wordinfo[].prev */\
	500	/* chain, so that when the bits of chain are later */\
	501	/* linked together, the dups appear in the chain */\
	502	trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
	503	trie->wordinfo[dupe].prev = curword; \
	504	} else { \
	505	/* we haven't inserted this word yet. */ \
	506	trie->states[ state ].wordnum = curword; \
	507	} \
	508	} STMT_END
	509
	510
	511	#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
	512	( ( base + charid >= ucharcount \
	513	&& base + charid < ubound \
	514	&& state == trie->trans[ base - ucharcount + charid ].check \
	515	&& trie->trans[ base - ucharcount + charid ].next ) \
	516	? trie->trans[ base - ucharcount + charid ].next \
	517	: ( state==1 ? special : 0 ) \
	518	)
	519
	520	#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
	521	STMT_START { \
	522	TRIE_BITMAP_SET(trie, uvc); \
	523	/* store the folded codepoint */ \
	524	if ( folder ) \
	525	TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
	526	\
	527	if ( !UTF ) { \
	528	/* store first byte of utf8 representation of */ \
	529	/* variant codepoints */ \
	530	if (! UVCHR_IS_INVARIANT(uvc)) { \
	531	TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
	532	} \
	533	} \
	534	} STMT_END
	535
	536	I32
	537	Perl_make_trie(pTHX_ RExC_state_t pRExC_state, regnode startbranch,
	538	regnode first, regnode last, regnode *tail,
	539	U32 word_count, U32 flags, U32 depth)
	540	{
	541	/* first pass, loop through and scan words */
	542	reg_trie_data *trie;
	543	HV *widecharmap = NULL;
	544	AV *revcharmap = newAV();
	545	regnode *cur;
	546	STRLEN len = 0;
	547	UV uvc = 0;
	548	U16 curword = 0;
	549	U32 next_alloc = 0;
	550	regnode *jumper = NULL;
	551	regnode *nextbranch = NULL;
	552	regnode *lastbranch = NULL;
	553	regnode *convert = NULL;
	554	U32 prev_states; / temp array mapping each state to previous one */
	555	/* we just use folder as a flag in utf8 */
	556	const U8 * folder = NULL;
	557
	558	/* in the below reg_add_data call we are storing either 'tu' or 'tuaa'
	559	* which stands for one trie structure, one hash, optionally followed
	560	* by two arrays */
	561	#ifdef DEBUGGING
	562	const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tuaa"));
	563	AV *trie_words = NULL;
	564	/* along with revcharmap, this only used during construction but both are
	565	* useful during debugging so we store them in the struct when debugging.
	566	*/
	567	#else
	568	const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tu"));
	569	STRLEN trie_charcount=0;
	570	#endif
	571	SV *re_trie_maxbuff;
	572	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	573
	574	PERL_ARGS_ASSERT_MAKE_TRIE;
	575	#ifndef DEBUGGING
	576	PERL_UNUSED_ARG(depth);
	577	#endif
	578
	579	switch (flags) {
	580	case EXACT: case EXACT_REQ8: case EXACTL: break;
	581	case EXACTFAA:
	582	case EXACTFUP:
	583	case EXACTFU:
	584	case EXACTFLU8: folder = PL_fold_latin1; break;
	585	case EXACTF: folder = PL_fold; break;
	586	default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) );
	587	}
	588
	589	/* create the trie struct, all zeroed */
	590	trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
	591	trie->refcount = 1;
	592	trie->startstate = 1;
	593	trie->wordcount = word_count;
	594	RExC_rxi->data->data[ data_slot ] = (void*)trie;
	595	trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
	596	if (flags == EXACT \|\| flags == EXACT_REQ8 \|\| flags == EXACTL)
	597	trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
	598	trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
	599	trie->wordcount+1, sizeof(reg_trie_wordinfo));
	600
	601	DEBUG_r({
	602	trie_words = newAV();
	603	});
	604
	605	re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
	606	assert(re_trie_maxbuff);
	607	if (!SvIOK(re_trie_maxbuff)) {
	608	sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
	609	}
	610	DEBUG_TRIE_COMPILE_r({
	611	Perl_re_indentf( aTHX_
	612	"make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
	613	depth+1,
	614	REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
	615	REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
	616	});
	617
	618	/* Find the node we are going to overwrite */
	619	if ( first == startbranch && OP( last ) != BRANCH ) {
	620	/* whole branch chain */
	621	convert = first;
	622	} else {
	623	/* branch sub-chain */
	624	convert = REGNODE_AFTER( first );
	625	}
	626
	627	/* -- First loop and Setup --
	628
	629	We first traverse the branches and scan each word to determine if it
	630	contains widechars, and how many unique chars there are, this is
	631	important as we have to build a table with at least as many columns as we
	632	have unique chars.
	633
	634	We use an array of integers to represent the character codes 0..255
	635	(trie->charmap) and we use a an HV* to store Unicode characters. We use
	636	the native representation of the character value as the key and IV's for
	637	the coded index.
	638
	639	TODO If we keep track of how many times each character is used we can
	640	remap the columns so that the table compression later on is more
	641	efficient in terms of memory by ensuring the most common value is in the
	642	middle and the least common are on the outside. IMO this would be better
	643	than a most to least common mapping as theres a decent chance the most
	644	common letter will share a node with the least common, meaning the node
	645	will not be compressible. With a middle is most common approach the worst
	646	case is when we have the least common nodes twice.
	647
	648	*/
	649
	650	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	651	regnode *noper = REGNODE_AFTER( cur );
	652	const U8 *uc;
	653	const U8 *e;
	654	int foldlen = 0;
	655	U32 wordlen = 0; /* required init */
	656	STRLEN minchars = 0;
	657	STRLEN maxchars = 0;
	658	bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
	659	bitmap?*/
	660	lastbranch = cur;
	661
	662	if (OP(noper) == NOTHING) {
	663	/* skip past a NOTHING at the start of an alternation
	664	* eg, /(?:)a\|(?:b)/ should be the same as /a\|b/
	665	*
	666	* If the next node is not something we are supposed to process
	667	* we will just ignore it due to the condition guarding the
	668	* next block.
	669	*/
	670
	671	regnode *noper_next= regnext(noper);
	672	if (noper_next < tail)
	673	noper= noper_next;
	674	}
	675
	676	if ( noper < tail
	677	&& ( OP(noper) == flags
	678	\|\| (flags == EXACT && OP(noper) == EXACT_REQ8)
	679	\|\| (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
	680	\|\| OP(noper) == EXACTFUP))))
	681	{
	682	uc= (U8*)STRING(noper);
	683	e= uc + STR_LEN(noper);
	684	} else {
	685	trie->minlen= 0;
	686	continue;
	687	}
	688
	689
	690	if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
	691	TRIE_BITMAP_SET(trie,uc); / store the raw first byte
	692	regardless of encoding */
	693	if (OP( noper ) == EXACTFUP) {
	694	/* false positives are ok, so just set this */
	695	TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
	696	}
	697	}
	698
	699	for ( ; uc < e ; uc += len ) { /* Look at each char in the current
	700	branch */
	701	TRIE_CHARCOUNT(trie)++;
	702	TRIE_READ_CHAR;
	703
	704	/* TRIE_READ_CHAR returns the current character, or its fold if /i
	705	* is in effect. Under /i, this character can match itself, or
	706	* anything that folds to it. If not under /i, it can match just
	707	* itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
	708	* all fold to k, and all are single characters. But some folds
	709	* expand to more than one character, so for example LATIN SMALL
	710	* LIGATURE FFI folds to the three character sequence 'ffi'. If
	711	* the string beginning at 'uc' is 'ffi', it could be matched by
	712	* three characters, or just by the one ligature character. (It
	713	* could also be matched by two characters: LATIN SMALL LIGATURE FF
	714	* followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
	715	* (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
	716	* match.) The trie needs to know the minimum and maximum number
	717	* of characters that could match so that it can use size alone to
	718	* quickly reject many match attempts. The max is simple: it is
	719	* the number of folded characters in this branch (since a fold is
	720	* never shorter than what folds to it. */
	721
	722	maxchars++;
	723
	724	/* And the min is equal to the max if not under /i (indicated by
	725	* 'folder' being NULL), or there are no multi-character folds. If
	726	* there is a multi-character fold, the min is incremented just
	727	* once, for the character that folds to the sequence. Each
	728	* character in the sequence needs to be added to the list below of
	729	* characters in the trie, but we count only the first towards the
	730	* min number of characters needed. This is done through the
	731	* variable 'foldlen', which is returned by the macros that look
	732	* for these sequences as the number of bytes the sequence
	733	* occupies. Each time through the loop, we decrement 'foldlen' by
	734	* how many bytes the current char occupies. Only when it reaches
	735	* 0 do we increment 'minchars' or look for another multi-character
	736	* sequence. */
	737	if (folder == NULL) {
	738	minchars++;
	739	}
	740	else if (foldlen > 0) {
	741	foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
	742	}
	743	else {
	744	minchars++;
	745
	746	/* See if *uc is the beginning of a multi-character fold. If
	747	* so, we decrement the length remaining to look at, to account
	748	* for the current character this iteration. (We can use 'uc'
	749	* instead of the fold returned by TRIE_READ_CHAR because the
	750	* macro is smart enough to account for any unfolded
	751	* characters. */
	752	if (UTF) {
	753	if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
	754	foldlen -= UTF8SKIP(uc);
	755	}
	756	}
	757	else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
	758	foldlen--;
	759	}
	760	}
	761
	762	/* The current character (and any potential folds) should be added
	763	* to the possible matching characters for this position in this
	764	* branch */
	765	if ( uvc < 256 ) {
	766	if ( folder ) {
	767	U8 folded= folder[ (U8) uvc ];
	768	if ( !trie->charmap[ folded ] ) {
	769	trie->charmap[ folded ]=( ++trie->uniquecharcount );
	770	TRIE_STORE_REVCHAR( folded );
	771	}
	772	}
	773	if ( !trie->charmap[ uvc ] ) {
	774	trie->charmap[ uvc ]=( ++trie->uniquecharcount );
	775	TRIE_STORE_REVCHAR( uvc );
	776	}
	777	if ( set_bit ) {
	778	/* store the codepoint in the bitmap, and its folded
	779	* equivalent. */
	780	TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
	781	set_bit = 0; /* We've done our bit :-) */
	782	}
	783	} else {
	784
	785	/* XXX We could come up with the list of code points that fold
	786	* to this using PL_utf8_foldclosures, except not for
	787	* multi-char folds, as there may be multiple combinations
	788	* there that could work, which needs to wait until runtime to
	789	* resolve (The comment about LIGATURE FFI above is such an
	790	* example */
	791
	792	SV** svpp;
	793	if ( !widecharmap )
	794	widecharmap = newHV();
	795
	796	svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
	797
	798	if ( !svpp )
	799	Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
	800
	801	if ( !SvTRUE( *svpp ) ) {
	802	sv_setiv( *svpp, ++trie->uniquecharcount );
	803	TRIE_STORE_REVCHAR(uvc);
	804	}
	805	}
	806	} /* end loop through characters in this branch of the trie */
	807
	808	/* We take the min and max for this branch and combine to find the min
	809	* and max for all branches processed so far */
	810	if( cur == first ) {
	811	trie->minlen = minchars;
	812	trie->maxlen = maxchars;
	813	} else if (minchars < trie->minlen) {
	814	trie->minlen = minchars;
	815	} else if (maxchars > trie->maxlen) {
	816	trie->maxlen = maxchars;
	817	}
	818	} /* end first pass */
	819	trie->before_paren = OP(first) == BRANCH
	820	? ARG1a(first)
	821	: ARG2a(first); /* BRANCHJ */
	822
	823	trie->after_paren = OP(lastbranch) == BRANCH
	824	? ARG1b(lastbranch)
	825	: ARG2b(lastbranch); /* BRANCHJ */
	826	DEBUG_TRIE_COMPILE_r(
	827	Perl_re_indentf( aTHX_
	828	"TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
	829	depth+1,
	830	( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
	831	(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
	832	(int)trie->minlen, (int)trie->maxlen )
	833	);
	834
	835	/*
	836	We now know what we are dealing with in terms of unique chars and
	837	string sizes so we can calculate how much memory a naive
	838	representation using a flat table will take. If it's over a reasonable
	839	limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
	840	conservative but potentially much slower representation using an array
	841	of lists.
	842
	843	At the end we convert both representations into the same compressed
	844	form that will be used in regexec.c for matching with. The latter
	845	is a form that cannot be used to construct with but has memory
	846	properties similar to the list form and access properties similar
	847	to the table form making it both suitable for fast searches and
	848	small enough that its feasable to store for the duration of a program.
	849
	850	See the comment in the code where the compressed table is produced
	851	inplace from the flat tabe representation for an explanation of how
	852	the compression works.
	853
	854	*/
	855
	856
	857	Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
	858	prev_states[1] = 0;
	859
	860	if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
	861	> SvIV(re_trie_maxbuff) )
	862	{
	863	/*
	864	Second Pass -- Array Of Lists Representation
	865
	866	Each state will be represented by a list of charid:state records
	867	(reg_trie_trans_le) the first such element holds the CUR and LEN
	868	points of the allocated array. (See defines above).
	869
	870	We build the initial structure using the lists, and then convert
	871	it into the compressed table form which allows faster lookups
	872	(but cant be modified once converted).
	873	*/
	874
	875	STRLEN transcount = 1;
	876
	877	DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
	878	depth+1));
	879
	880	trie->states = (reg_trie_state *)
	881	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	882	sizeof(reg_trie_state) );
	883	TRIE_LIST_NEW(1);
	884	next_alloc = 2;
	885
	886	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	887
	888	regnode *noper = REGNODE_AFTER( cur );
	889	U32 state = 1; /* required init */
	890	U16 charid = 0; /* sanity init */
	891	U32 wordlen = 0; /* required init */
	892
	893	if (OP(noper) == NOTHING) {
	894	regnode *noper_next= regnext(noper);
	895	if (noper_next < tail)
	896	noper= noper_next;
	897	/* we will undo this assignment if noper does not
	898	* point at a trieable type in the else clause of
	899	* the following statement. */
	900	}
	901
	902	if ( noper < tail
	903	&& ( OP(noper) == flags
	904	\|\| (flags == EXACT && OP(noper) == EXACT_REQ8)
	905	\|\| (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
	906	\|\| OP(noper) == EXACTFUP))))
	907	{
	908	const U8 uc= (U8)STRING(noper);
	909	const U8 *e= uc + STR_LEN(noper);
	910
	911	for ( ; uc < e ; uc += len ) {
	912
	913	TRIE_READ_CHAR;
	914
	915	if ( uvc < 256 ) {
	916	charid = trie->charmap[ uvc ];
	917	} else {
	918	SV** const svpp = hv_fetch( widecharmap,
	919	(char*)&uvc,
	920	sizeof( UV ),
	921	0);
	922	if ( !svpp ) {
	923	charid = 0;
	924	} else {
	925	charid=(U16)SvIV( *svpp );
	926	}
	927	}
	928	/* charid is now 0 if we dont know the char read, or
	929	* nonzero if we do */
	930	if ( charid ) {
	931
	932	U16 check;
	933	U32 newstate = 0;
	934
	935	charid--;
	936	if ( !trie->states[ state ].trans.list ) {
	937	TRIE_LIST_NEW( state );
	938	}
	939	for ( check = 1;
	940	check <= TRIE_LIST_USED( state );
	941	check++ )
	942	{
	943	if ( TRIE_LIST_ITEM( state, check ).forid
	944	== charid )
	945	{
	946	newstate = TRIE_LIST_ITEM( state, check ).newstate;
	947	break;
	948	}
	949	}
	950	if ( ! newstate ) {
	951	newstate = next_alloc++;
	952	prev_states[newstate] = state;
	953	TRIE_LIST_PUSH( state, charid, newstate );
	954	transcount++;
	955	}
	956	state = newstate;
	957	} else {
	958	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
	959	}
	960	}
	961	} else {
	962	/* If we end up here it is because we skipped past a NOTHING, but did not end up
	963	* on a trieable type. So we need to reset noper back to point at the first regop
	964	* in the branch before we call TRIE_HANDLE_WORD()
	965	*/
	966	noper= REGNODE_AFTER(cur);
	967	}
	968	TRIE_HANDLE_WORD(state);
	969
	970	} /* end second pass */
	971
	972	/* next alloc is the NEXT state to be allocated */
	973	trie->statecount = next_alloc;
	974	trie->states = (reg_trie_state *)
	975	PerlMemShared_realloc( trie->states,
	976	next_alloc
	977	* sizeof(reg_trie_state) );
	978
	979	/* and now dump it out before we compress it */
	980	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
	981	revcharmap, next_alloc,
	982	depth+1)
	983	);
	984
	985	trie->trans = (reg_trie_trans *)
	986	PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
	987	{
	988	U32 state;
	989	U32 tp = 0;
	990	U32 zp = 0;
	991
	992
	993	for( state=1 ; state < next_alloc ; state ++ ) {
	994	U32 base=0;
	995
	996	/*
	997	DEBUG_TRIE_COMPILE_MORE_r(
	998	Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
	999	);
	1000	*/
	1001
	1002	if (trie->states[state].trans.list) {
	1003	U16 minid=TRIE_LIST_ITEM( state, 1).forid;
	1004	U16 maxid=minid;
	1005	U16 idx;
	1006
	1007	for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1008	const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
	1009	if ( forid < minid ) {
	1010	minid=forid;
	1011	} else if ( forid > maxid ) {
	1012	maxid=forid;
	1013	}
	1014	}
	1015	if ( transcount < tp + maxid - minid + 1) {
	1016	transcount *= 2;
	1017	trie->trans = (reg_trie_trans *)
	1018	PerlMemShared_realloc( trie->trans,
	1019	transcount
	1020	* sizeof(reg_trie_trans) );
	1021	Zero( trie->trans + (transcount / 2),
	1022	transcount / 2,
	1023	reg_trie_trans );
	1024	}
	1025	base = trie->uniquecharcount + tp - minid;
	1026	if ( maxid == minid ) {
	1027	U32 set = 0;
	1028	for ( ; zp < tp ; zp++ ) {
	1029	if ( ! trie->trans[ zp ].next ) {
	1030	base = trie->uniquecharcount + zp - minid;
	1031	trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
	1032	1).newstate;
	1033	trie->trans[ zp ].check = state;
	1034	set = 1;
	1035	break;
	1036	}
	1037	}
	1038	if ( !set ) {
	1039	trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
	1040	1).newstate;
	1041	trie->trans[ tp ].check = state;
	1042	tp++;
	1043	zp = tp;
	1044	}
	1045	} else {
	1046	for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
	1047	const U32 tid = base
	1048	- trie->uniquecharcount
	1049	+ TRIE_LIST_ITEM( state, idx ).forid;
	1050	trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
	1051	idx ).newstate;
	1052	trie->trans[ tid ].check = state;
	1053	}
	1054	tp += ( maxid - minid + 1 );
	1055	}
	1056	Safefree(trie->states[ state ].trans.list);
	1057	}
	1058	/*
	1059	DEBUG_TRIE_COMPILE_MORE_r(
	1060	Perl_re_printf( aTHX_ " base: %d\n",base);
	1061	);
	1062	*/
	1063	trie->states[ state ].trans.base=base;
	1064	}
	1065	trie->lasttrans = tp + 1;
	1066	}
	1067	} else {
	1068	/*
	1069	Second Pass -- Flat Table Representation.
	1070
	1071	we dont use the 0 slot of either trans[] or states[] so we add 1 to
	1072	each. We know that we will need Charcount+1 trans at most to store
	1073	the data (one row per char at worst case) So we preallocate both
	1074	structures assuming worst case.
	1075
	1076	We then construct the trie using only the .next slots of the entry
	1077	structs.
	1078
	1079	We use the .check field of the first entry of the node temporarily
	1080	to make compression both faster and easier by keeping track of how
	1081	many non zero fields are in the node.
	1082
	1083	Since trans are numbered from 1 any 0 pointer in the table is a FAIL
	1084	transition.
	1085
	1086	There are two terms at use here: state as a TRIE_NODEIDX() which is
	1087	a number representing the first entry of the node, and state as a
	1088	TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
	1089	and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
	1090	if there are 2 entrys per node. eg:
	1091
	1092	A B A B
	1093	1. 2 4 1. 3 7
	1094	2. 0 3 3. 0 5
	1095	3. 0 0 5. 0 0
	1096	4. 0 0 7. 0 0
	1097
	1098	The table is internally in the right hand, idx form. However as we
	1099	also have to deal with the states array which is indexed by nodenum
	1100	we have to use TRIE_NODENUM() to convert.
	1101
	1102	*/
	1103	DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
	1104	depth+1));
	1105
	1106	trie->trans = (reg_trie_trans *)
	1107	PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
	1108	* trie->uniquecharcount + 1,
	1109	sizeof(reg_trie_trans) );
	1110	trie->states = (reg_trie_state *)
	1111	PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
	1112	sizeof(reg_trie_state) );
	1113	next_alloc = trie->uniquecharcount + 1;
	1114
	1115
	1116	for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
	1117
	1118	regnode *noper = REGNODE_AFTER( cur );
	1119
	1120	U32 state = 1; /* required init */
	1121
	1122	U16 charid = 0; /* sanity init */
	1123	U32 accept_state = 0; /* sanity init */
	1124
	1125	U32 wordlen = 0; /* required init */
	1126
	1127	if (OP(noper) == NOTHING) {
	1128	regnode *noper_next= regnext(noper);
	1129	if (noper_next < tail)
	1130	noper= noper_next;
	1131	/* we will undo this assignment if noper does not
	1132	* point at a trieable type in the else clause of
	1133	* the following statement. */
	1134	}
	1135
	1136	if ( noper < tail
	1137	&& ( OP(noper) == flags
	1138	\|\| (flags == EXACT && OP(noper) == EXACT_REQ8)
	1139	\|\| (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
	1140	\|\| OP(noper) == EXACTFUP))))
	1141	{
	1142	const U8 uc= (U8)STRING(noper);
	1143	const U8 *e= uc + STR_LEN(noper);
	1144
	1145	for ( ; uc < e ; uc += len ) {
	1146
	1147	TRIE_READ_CHAR;
	1148
	1149	if ( uvc < 256 ) {
	1150	charid = trie->charmap[ uvc ];
	1151	} else {
	1152	SV* const * const svpp = hv_fetch( widecharmap,
	1153	(char*)&uvc,
	1154	sizeof( UV ),
	1155	0);
	1156	charid = svpp ? (U16)SvIV(*svpp) : 0;
	1157	}
	1158	if ( charid ) {
	1159	charid--;
	1160	if ( !trie->trans[ state + charid ].next ) {
	1161	trie->trans[ state + charid ].next = next_alloc;
	1162	trie->trans[ state ].check++;
	1163	prev_states[TRIE_NODENUM(next_alloc)]
	1164	= TRIE_NODENUM(state);
	1165	next_alloc += trie->uniquecharcount;
	1166	}
	1167	state = trie->trans[ state + charid ].next;
	1168	} else {
	1169	Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
	1170	}
	1171	/* charid is now 0 if we dont know the char read, or
	1172	* nonzero if we do */
	1173	}
	1174	} else {
	1175	/* If we end up here it is because we skipped past a NOTHING, but did not end up
	1176	* on a trieable type. So we need to reset noper back to point at the first regop
	1177	* in the branch before we call TRIE_HANDLE_WORD().
	1178	*/
	1179	noper= REGNODE_AFTER(cur);
	1180	}
	1181	accept_state = TRIE_NODENUM( state );
	1182	TRIE_HANDLE_WORD(accept_state);
	1183
	1184	} /* end second pass */
	1185
	1186	/* and now dump it out before we compress it */
	1187	DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
	1188	revcharmap,
	1189	next_alloc, depth+1));
	1190
	1191	{
	1192	/*
	1193	* Inplace compress the table.*
	1194
	1195	For sparse data sets the table constructed by the trie algorithm will
	1196	be mostly 0/FAIL transitions or to put it another way mostly empty.
	1197	(Note that leaf nodes will not contain any transitions.)
	1198
	1199	This algorithm compresses the tables by eliminating most such
	1200	transitions, at the cost of a modest bit of extra work during lookup:
	1201
	1202	- Each states[] entry contains a .base field which indicates the
	1203	index in the state[] array wheres its transition data is stored.
	1204
	1205	- If .base is 0 there are no valid transitions from that node.
	1206
	1207	- If .base is nonzero then charid is added to it to find an entry in
	1208	the trans array.
	1209
	1210	-If trans[states[state].base+charid].check!=state then the
	1211	transition is taken to be a 0/Fail transition. Thus if there are fail
	1212	transitions at the front of the node then the .base offset will point
	1213	somewhere inside the previous nodes data (or maybe even into a node
	1214	even earlier), but the .check field determines if the transition is
	1215	valid.
	1216
	1217	XXX - wrong maybe?
	1218	The following process inplace converts the table to the compressed
	1219	table: We first do not compress the root node 1,and mark all its
	1220	.check pointers as 1 and set its .base pointer as 1 as well. This
	1221	allows us to do a DFA construction from the compressed table later,
	1222	and ensures that any .base pointers we calculate later are greater
	1223	than 0.
	1224
	1225	- We set 'pos' to indicate the first entry of the second node.
	1226
	1227	- We then iterate over the columns of the node, finding the first and
	1228	last used entry at l and m. We then copy l..m into pos..(pos+m-l),
	1229	and set the .check pointers accordingly, and advance pos
	1230	appropriately and repreat for the next node. Note that when we copy
	1231	the next pointers we have to convert them from the original
	1232	NODEIDX form to NODENUM form as the former is not valid post
	1233	compression.
	1234
	1235	- If a node has no transitions used we mark its base as 0 and do not
	1236	advance the pos pointer.
	1237
	1238	- If a node only has one transition we use a second pointer into the
	1239	structure to fill in allocated fail transitions from other states.
	1240	This pointer is independent of the main pointer and scans forward
	1241	looking for null transitions that are allocated to a state. When it
	1242	finds one it writes the single transition into the "hole". If the
	1243	pointer doesnt find one the single transition is appended as normal.
	1244
	1245	- Once compressed we can Renew/realloc the structures to release the
	1246	excess space.
	1247
	1248	See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
	1249	specifically Fig 3.47 and the associated pseudocode.
	1250
	1251	demq
	1252	*/
	1253	const U32 laststate = TRIE_NODENUM( next_alloc );
	1254	U32 state, charid;
	1255	U32 pos = 0, zp=0;
	1256	trie->statecount = laststate;
	1257
	1258	for ( state = 1 ; state < laststate ; state++ ) {
	1259	U8 flag = 0;
	1260	const U32 stateidx = TRIE_NODEIDX( state );
	1261	const U32 o_used = trie->trans[ stateidx ].check;
	1262	U32 used = trie->trans[ stateidx ].check;
	1263	trie->trans[ stateidx ].check = 0;
	1264
	1265	for ( charid = 0;
	1266	used && charid < trie->uniquecharcount;
	1267	charid++ )
	1268	{
	1269	if ( flag \|\| trie->trans[ stateidx + charid ].next ) {
	1270	if ( trie->trans[ stateidx + charid ].next ) {
	1271	if (o_used == 1) {
	1272	for ( ; zp < pos ; zp++ ) {
	1273	if ( ! trie->trans[ zp ].next ) {
	1274	break;
	1275	}
	1276	}
	1277	trie->states[ state ].trans.base
	1278	= zp
	1279	+ trie->uniquecharcount
	1280	- charid ;
	1281	trie->trans[ zp ].next
	1282	= SAFE_TRIE_NODENUM( trie->trans[ stateidx
	1283	+ charid ].next );
	1284	trie->trans[ zp ].check = state;
	1285	if ( ++zp > pos ) pos = zp;
	1286	break;
	1287	}
	1288	used--;
	1289	}
	1290	if ( !flag ) {
	1291	flag = 1;
	1292	trie->states[ state ].trans.base
	1293	= pos + trie->uniquecharcount - charid ;
	1294	}
	1295	trie->trans[ pos ].next
	1296	= SAFE_TRIE_NODENUM(
	1297	trie->trans[ stateidx + charid ].next );
	1298	trie->trans[ pos ].check = state;
	1299	pos++;
	1300	}
	1301	}
	1302	}
	1303	trie->lasttrans = pos + 1;
	1304	trie->states = (reg_trie_state *)
	1305	PerlMemShared_realloc( trie->states, laststate
	1306	* sizeof(reg_trie_state) );
	1307	DEBUG_TRIE_COMPILE_MORE_r(
	1308	Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
	1309	depth+1,
	1310	(int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
	1311	+ 1 ),
	1312	(IV)next_alloc,
	1313	(IV)pos,
	1314	( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
	1315	);
	1316
	1317	} /* end table compress */
	1318	}
	1319	DEBUG_TRIE_COMPILE_MORE_r(
	1320	Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
	1321	depth+1,
	1322	(UV)trie->statecount,
	1323	(UV)trie->lasttrans)
	1324	);
	1325	/* resize the trans array to remove unused space */
	1326	trie->trans = (reg_trie_trans *)
	1327	PerlMemShared_realloc( trie->trans, trie->lasttrans
	1328	* sizeof(reg_trie_trans) );
	1329
	1330	{ /* Modify the program and insert the new TRIE node */
	1331	U8 nodetype =(U8) flags;
	1332	char *str=NULL;
	1333
	1334	#ifdef DEBUGGING
	1335	regnode *optimize = NULL;
	1336	#endif /* DEBUGGING */
	1337	/* make sure we have enough room to inject the TRIE op */
	1338	assert((!trie->jump) \|\| !trie->jump[1] \|\|
	1339	(trie->jump[1] >= (sizeof(tregnode_TRIE)/sizeof(struct regnode))));
	1340	/*
	1341	This means we convert either the first branch or the first Exact,
	1342	depending on whether the thing following (in 'last') is a branch
	1343	or not and whther first is the startbranch (ie is it a sub part of
	1344	the alternation or is it the whole thing.)
	1345	Assuming its a sub part we convert the EXACT otherwise we convert
	1346	the whole branch sequence, including the first.
	1347	*/
	1348	/* Find the node we are going to overwrite */
	1349	if ( first != startbranch \|\| OP( last ) == BRANCH ) {
	1350	/* branch sub-chain */
	1351	NEXT_OFF( first ) = (U16)(last - first);
	1352	/* whole branch chain */
	1353	}
	1354	/* But first we check to see if there is a common prefix we can
	1355	split out as an EXACT and put in front of the TRIE node. */
	1356	trie->startstate= 1;
	1357	if ( trie->bitmap && !widecharmap && !trie->jump ) {
	1358	/* we want to find the first state that has more than
	1359	* one transition, if that state is not the first state
	1360	* then we have a common prefix which we can remove.
	1361	*/
	1362	U32 state;
	1363	for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
	1364	U32 ofs = 0;
	1365	I32 first_ofs = -1; /* keeps track of the ofs of the first
	1366	transition, -1 means none */
	1367	U32 count = 0;
	1368	const U32 base = trie->states[ state ].trans.base;
	1369
	1370	/* does this state terminate an alternation? */
	1371	if ( trie->states[state].wordnum )
	1372	count = 1;
	1373
	1374	for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
	1375	if ( ( base + ofs >= trie->uniquecharcount ) &&
	1376	( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
	1377	trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
	1378	{
	1379	if ( ++count > 1 ) {
	1380	/* we have more than one transition */
	1381	SV **tmp;
	1382	U8 *ch;
	1383	/* if this is the first state there is no common prefix
	1384	* to extract, so we can exit */
	1385	if ( state == 1 ) break;
	1386	tmp = av_fetch_simple( revcharmap, ofs, 0);
	1387	ch = (U8)SvPV_nolen_const( tmp );
	1388
	1389	/* if we are on count 2 then we need to initialize the
	1390	* bitmap, and store the previous char if there was one
	1391	* in it*/
	1392	if ( count == 2 ) {
	1393	/* clear the bitmap */
	1394	Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
	1395	DEBUG_OPTIMISE_r(
	1396	Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
	1397	depth+1,
	1398	(UV)state));
	1399	if (first_ofs >= 0) {
	1400	SV ** const tmp = av_fetch_simple( revcharmap, first_ofs, 0);
	1401	const U8 * const ch = (U8)SvPV_nolen_const( tmp );
	1402
	1403	TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
	1404	DEBUG_OPTIMISE_r(
	1405	Perl_re_printf( aTHX_ "%s", (char*)ch)
	1406	);
	1407	}
	1408	}
	1409	/* store the current firstchar in the bitmap */
	1410	TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
	1411	DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
	1412	}
	1413	first_ofs = ofs;
	1414	}
	1415	}
	1416	if ( count == 1 ) {
	1417	/* This state has only one transition, its transition is part
	1418	* of a common prefix - we need to concatenate the char it
	1419	* represents to what we have so far. */
	1420	SV **tmp = av_fetch_simple( revcharmap, first_ofs, 0);
	1421	STRLEN len;
	1422	char ch = SvPV( tmp, len );
	1423	DEBUG_OPTIMISE_r({
	1424	SV *sv=sv_newmortal();
	1425	Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
	1426	depth+1,
	1427	(UV)state, (UV)first_ofs,
	1428	pv_pretty(sv, SvPV_nolen_const(tmp), SvCUR(tmp), 6,
	1429	PL_colors[0], PL_colors[1],
	1430	(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) \|
	1431	PERL_PV_ESCAPE_FIRSTCHAR
	1432	)
	1433	);
	1434	});
	1435	if ( state==1 ) {
	1436	OP( convert ) = nodetype;
	1437	str=STRING(convert);
	1438	setSTR_LEN(convert, 0);
	1439	}
	1440	assert( ( STR_LEN(convert) + len ) < 256 );
	1441	setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
	1442	while (len--)
	1443	str++ = ch++;
	1444	} else {
	1445	#ifdef DEBUGGING
	1446	if (state>1)
	1447	DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
	1448	#endif
	1449	break;
	1450	}
	1451	}
	1452	trie->prefixlen = (state-1);
	1453	if (str) {
	1454	regnode *n = REGNODE_AFTER(convert);
	1455	assert( n - convert <= U16_MAX );
	1456	NEXT_OFF(convert) = n - convert;
	1457	trie->startstate = state;
	1458	trie->minlen -= (state - 1);
	1459	trie->maxlen -= (state - 1);
	1460	#ifdef DEBUGGING
	1461	/* At least the UNICOS C compiler choked on this
	1462	* being argument to DEBUG_r(), so let's just have
	1463	* it right here. */
	1464	if (
	1465	#ifdef PERL_EXT_RE_BUILD
	1466	1
	1467	#else
	1468	DEBUG_r_TEST
	1469	#endif
	1470	) {
	1471	U32 word = trie->wordcount;
	1472	while (word--) {
	1473	SV ** const tmp = av_fetch_simple( trie_words, word, 0 );
	1474	if (tmp) {
	1475	if ( STR_LEN(convert) <= SvCUR(*tmp) )
	1476	sv_chop(tmp, SvPV_nolen(tmp) + STR_LEN(convert));
	1477	else
	1478	sv_chop(tmp, SvPV_nolen(tmp) + SvCUR(*tmp));
	1479	}
	1480	}
	1481	}
	1482	#endif
	1483	if (trie->maxlen) {
	1484	convert = n;
	1485	} else {
	1486	NEXT_OFF(convert) = (U16)(tail - convert);
	1487	DEBUG_r(optimize= n);
	1488	}
	1489	}
	1490	}
	1491	if (!jumper)
	1492	jumper = last;
	1493	if ( trie->maxlen ) {
	1494	NEXT_OFF( convert ) = (U16)(tail - convert);
	1495	ARG1u_SET( convert, data_slot );
	1496	/* Store the offset to the first unabsorbed branch in
	1497	jump[0], which is otherwise unused by the jump logic.
	1498	We use this when dumping a trie and during optimisation. */
	1499	if (trie->jump)
	1500	trie->jump[0] = (U16)(nextbranch - convert);
	1501
	1502	/* If the start state is not accepting (meaning there is no empty string/NOTHING)
	1503	* and there is a bitmap
	1504	* and the first "jump target" node we found leaves enough room
	1505	* then convert the TRIE node into a TRIEC node, with the bitmap
	1506	* embedded inline in the opcode - this is hypothetically faster.
	1507	*/
	1508	if ( !trie->states[trie->startstate].wordnum
	1509	&& trie->bitmap
	1510	&& ( (char )jumper - (char )convert) >= (int)sizeof(tregnode_TRIEC) )
	1511	{
	1512	OP( convert ) = TRIEC;
	1513	Copy(trie->bitmap, ((tregnode_TRIEC *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
	1514	PerlMemShared_free(trie->bitmap);
	1515	trie->bitmap= NULL;
	1516	} else
	1517	OP( convert ) = TRIE;
	1518
	1519	/* store the type in the flags */
	1520	FLAGS(convert) = nodetype;
	1521	DEBUG_r({
	1522	optimize = convert
	1523	+ NODE_STEP_REGNODE
	1524	+ REGNODE_ARG_LEN( OP( convert ) );
	1525	});
	1526	/* XXX We really should free up the resource in trie now,
	1527	as we won't use them - (which resources?) dmq */
	1528	}
	1529	/* needed for dumping*/
	1530	DEBUG_r(if (optimize) {
	1531	/*
	1532	Try to clean up some of the debris left after the
	1533	optimisation.
	1534	*/
	1535	while( optimize < jumper ) {
	1536	OP( optimize ) = OPTIMIZED;
	1537	optimize++;
	1538	}
	1539	});
	1540	} /* end node insert */
	1541
	1542	/* Finish populating the prev field of the wordinfo array. Walk back
	1543	* from each accept state until we find another accept state, and if
	1544	* so, point the first word's .prev field at the second word. If the
	1545	* second already has a .prev field set, stop now. This will be the
	1546	* case either if we've already processed that word's accept state,
	1547	* or that state had multiple words, and the overspill words were
	1548	* already linked up earlier.
	1549	*/
	1550	{
	1551	U16 word;
	1552	U32 state;
	1553	U16 prev;
	1554
	1555	for (word=1; word <= trie->wordcount; word++) {
	1556	prev = 0;
	1557	if (trie->wordinfo[word].prev)
	1558	continue;
	1559	state = trie->wordinfo[word].accept;
	1560	while (state) {
	1561	state = prev_states[state];
	1562	if (!state)
	1563	break;
	1564	prev = trie->states[state].wordnum;
	1565	if (prev)
	1566	break;
	1567	}
	1568	trie->wordinfo[word].prev = prev;
	1569	}
	1570	Safefree(prev_states);
	1571	}
	1572
	1573
	1574	/* and now dump out the compressed format */
	1575	DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
	1576
	1577	RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
	1578	#ifdef DEBUGGING
	1579	RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
	1580	RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
	1581	#else
	1582	SvREFCNT_dec_NN(revcharmap);
	1583	#endif
	1584	return trie->jump
	1585	? MADE_JUMP_TRIE
	1586	: trie->startstate>1
	1587	? MADE_EXACT_TRIE
	1588	: MADE_TRIE;
	1589	}
	1590
	1591	regnode *
	1592	Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t pRExC_state, regnode source, U32 depth)
	1593	{
	1594	/* The Trie is constructed and compressed now so we can build a fail array if
	1595	* it's needed
	1596
	1597	This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
	1598	3.32 in the
	1599	"Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
	1600	Ullman 1985/88
	1601	ISBN 0-201-10088-6
	1602
	1603	We find the fail state for each state in the trie, this state is the longest
	1604	proper suffix of the current state's 'word' that is also a proper prefix of
	1605	another word in our trie. State 1 represents the word '' and is thus the
	1606	default fail state. This allows the DFA not to have to restart after its
	1607	tried and failed a word at a given point, it simply continues as though it
	1608	had been matching the other word in the first place.
	1609	Consider
	1610	'abcdgu'=~/abcdefg\|cdgu/
	1611	When we get to 'd' we are still matching the first word, we would encounter
	1612	'g' which would fail, which would bring us to the state representing 'd' in
	1613	the second word where we would try 'g' and succeed, proceeding to match
	1614	'cdgu'.
	1615	*/
	1616	/* add a fail transition */
	1617	const U32 trie_offset = ARG1u(source);
	1618	reg_trie_data trie=(reg_trie_data )RExC_rxi->data->data[trie_offset];
	1619	U32 *q;
	1620	const U32 ucharcount = trie->uniquecharcount;
	1621	const U32 numstates = trie->statecount;
	1622	const U32 ubound = trie->lasttrans + ucharcount;
	1623	U32 q_read = 0;
	1624	U32 q_write = 0;
	1625	U32 charid;
	1626	U32 base = trie->states[ 1 ].trans.base;
	1627	U32 *fail;
	1628	reg_ac_data *aho;
	1629	const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("T"));
	1630	regnode *stclass;
	1631	DECLARE_AND_GET_RE_DEBUG_FLAGS;
	1632
	1633	PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
	1634	PERL_UNUSED_CONTEXT;
	1635	#ifndef DEBUGGING
	1636	PERL_UNUSED_ARG(depth);
	1637	#endif
	1638
	1639	if ( OP(source) == TRIE ) {
	1640	tregnode_TRIE op = (tregnode_TRIE )
	1641	PerlMemShared_calloc(1, sizeof(tregnode_TRIE));
	1642	StructCopy(source, op, tregnode_TRIE);
	1643	stclass = (regnode *)op;
	1644	} else {
	1645	tregnode_TRIEC op = (tregnode_TRIEC )
	1646	PerlMemShared_calloc(1, sizeof(tregnode_TRIEC));
	1647	StructCopy(source, op, tregnode_TRIEC);
	1648	stclass = (regnode *)op;
	1649	}
	1650	OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
	1651
	1652	ARG1u_SET( stclass, data_slot );
	1653	aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
	1654	RExC_rxi->data->data[ data_slot ] = (void*)aho;
	1655	aho->trie=trie_offset;
	1656	aho->states=(reg_trie_state )PerlMemShared_malloc( numstates sizeof(reg_trie_state) );
	1657	Copy( trie->states, aho->states, numstates, reg_trie_state );
	1658	Newx( q, numstates, U32);
	1659	aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
	1660	aho->refcount = 1;
	1661	fail = aho->fail;
	1662	/* initialize fail[0..1] to be 1 so that we always have
	1663	a valid final fail state */
	1664	fail[ 0 ] = fail[ 1 ] = 1;
	1665
	1666	for ( charid = 0; charid < ucharcount ; charid++ ) {
	1667	const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
	1668	if ( newstate ) {
	1669	q[ q_write ] = newstate;
	1670	/* set to point at the root */
	1671	fail[ q[ q_write++ ] ]=1;
	1672	}
	1673	}
	1674	while ( q_read < q_write) {
	1675	const U32 cur = q[ q_read++ % numstates ];
	1676	base = trie->states[ cur ].trans.base;
	1677
	1678	for ( charid = 0 ; charid < ucharcount ; charid++ ) {
	1679	const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
	1680	if (ch_state) {
	1681	U32 fail_state = cur;
	1682	U32 fail_base;
	1683	do {
	1684	fail_state = fail[ fail_state ];
	1685	fail_base = aho->states[ fail_state ].trans.base;
	1686	} while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
	1687
	1688	fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
	1689	fail[ ch_state ] = fail_state;
	1690	if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
	1691	{
	1692	aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
	1693	}
	1694	q[ q_write++ % numstates] = ch_state;
	1695	}
	1696	}
	1697	}
	1698	/* restore fail[0..1] to 0 so that we "fall out" of the AC loop
	1699	when we fail in state 1, this allows us to use the
	1700	charclass scan to find a valid start char. This is based on the principle
	1701	that theres a good chance the string being searched contains lots of stuff
	1702	that cant be a start char.
	1703	*/
	1704	fail[ 0 ] = fail[ 1 ] = 0;
	1705	DEBUG_TRIE_COMPILE_r({
	1706	Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
	1707	depth, (UV)numstates
	1708	);
	1709	for( q_read=1; q_read<numstates; q_read++ ) {
	1710	Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
	1711	}
	1712	Perl_re_printf( aTHX_ "\n");
	1713	});
	1714	Safefree(q);
	1715	/RExC_seen \|= REG_TRIEDFA_SEEN;/
	1716	return stclass;
	1717	}