This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #40718] perl parser bug leading to memory corruption
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
e2e6a0f1 117 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
118 I32 extralen;
119 I32 seen_zerolen;
120 I32 seen_evals;
40d049e4
YO
121 regnode **open_parens; /* pointers to open parens */
122 regnode **close_parens; /* pointers to close parens */
123 regnode *opend; /* END node in program */
1aa99e6b 124 I32 utf8;
6bda09f9 125 HV *charnames; /* cache of named sequences */
81714fb9 126 HV *paren_names; /* Paren names */
40d049e4
YO
127 regnode **recurse; /* Recurse regops */
128 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
3dab1dad 133#ifdef DEBUGGING
be8e71aa 134 const char *lastparse;
3dab1dad
YO
135 I32 lastnum;
136#define RExC_lastparse (pRExC_state->lastparse)
137#define RExC_lastnum (pRExC_state->lastnum)
138#endif
830247a4
IZ
139} RExC_state_t;
140
e2509266 141#define RExC_flags (pRExC_state->flags)
830247a4
IZ
142#define RExC_precomp (pRExC_state->precomp)
143#define RExC_rx (pRExC_state->rx)
fac92740 144#define RExC_start (pRExC_state->start)
830247a4
IZ
145#define RExC_end (pRExC_state->end)
146#define RExC_parse (pRExC_state->parse)
147#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 148#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 149#define RExC_emit (pRExC_state->emit)
fac92740 150#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
151#define RExC_naughty (pRExC_state->naughty)
152#define RExC_sawback (pRExC_state->sawback)
153#define RExC_seen (pRExC_state->seen)
154#define RExC_size (pRExC_state->size)
155#define RExC_npar (pRExC_state->npar)
e2e6a0f1 156#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
157#define RExC_extralen (pRExC_state->extralen)
158#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
159#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 160#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 161#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
162#define RExC_open_parens (pRExC_state->open_parens)
163#define RExC_close_parens (pRExC_state->close_parens)
164#define RExC_opend (pRExC_state->opend)
81714fb9 165#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
166#define RExC_recurse (pRExC_state->recurse)
167#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 168
a687059c
LW
169#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
170#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
171 ((*s) == '{' && regcurly(s)))
a687059c 172
35c8bce7
LW
173#ifdef SPSTART
174#undef SPSTART /* dratted cpp namespace... */
175#endif
a687059c
LW
176/*
177 * Flags to be passed up and down.
178 */
a687059c 179#define WORST 0 /* Worst case. */
821b33a5 180#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
181#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
182#define SPSTART 0x4 /* Starts with * or +. */
183#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 184
3dab1dad
YO
185#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
186
07be1b83
YO
187/* whether trie related optimizations are enabled */
188#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
189#define TRIE_STUDY_OPT
786e8c11 190#define FULL_TRIE_STUDY
07be1b83
YO
191#define TRIE_STCLASS
192#endif
1de06328
YO
193
194
40d049e4
YO
195
196#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
197#define PBITVAL(paren) (1 << ((paren) & 7))
198#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
199#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
200#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
201
202
1de06328
YO
203/* About scan_data_t.
204
205 During optimisation we recurse through the regexp program performing
206 various inplace (keyhole style) optimisations. In addition study_chunk
207 and scan_commit populate this data structure with information about
208 what strings MUST appear in the pattern. We look for the longest
209 string that must appear for at a fixed location, and we look for the
210 longest string that may appear at a floating location. So for instance
211 in the pattern:
212
213 /FOO[xX]A.*B[xX]BAR/
214
215 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
216 strings (because they follow a .* construct). study_chunk will identify
217 both FOO and BAR as being the longest fixed and floating strings respectively.
218
219 The strings can be composites, for instance
220
221 /(f)(o)(o)/
222
223 will result in a composite fixed substring 'foo'.
224
225 For each string some basic information is maintained:
226
227 - offset or min_offset
228 This is the position the string must appear at, or not before.
229 It also implicitly (when combined with minlenp) tells us how many
230 character must match before the string we are searching.
231 Likewise when combined with minlenp and the length of the string
232 tells us how many characters must appear after the string we have
233 found.
234
235 - max_offset
236 Only used for floating strings. This is the rightmost point that
237 the string can appear at. Ifset to I32 max it indicates that the
238 string can occur infinitely far to the right.
239
240 - minlenp
241 A pointer to the minimum length of the pattern that the string
242 was found inside. This is important as in the case of positive
243 lookahead or positive lookbehind we can have multiple patterns
244 involved. Consider
245
246 /(?=FOO).*F/
247
248 The minimum length of the pattern overall is 3, the minimum length
249 of the lookahead part is 3, but the minimum length of the part that
250 will actually match is 1. So 'FOO's minimum length is 3, but the
251 minimum length for the F is 1. This is important as the minimum length
252 is used to determine offsets in front of and behind the string being
253 looked for. Since strings can be composites this is the length of the
254 pattern at the time it was commited with a scan_commit. Note that
255 the length is calculated by study_chunk, so that the minimum lengths
256 are not known until the full pattern has been compiled, thus the
257 pointer to the value.
258
259 - lookbehind
260
261 In the case of lookbehind the string being searched for can be
262 offset past the start point of the final matching string.
263 If this value was just blithely removed from the min_offset it would
264 invalidate some of the calculations for how many chars must match
265 before or after (as they are derived from min_offset and minlen and
266 the length of the string being searched for).
267 When the final pattern is compiled and the data is moved from the
268 scan_data_t structure into the regexp structure the information
269 about lookbehind is factored in, with the information that would
270 have been lost precalculated in the end_shift field for the
271 associated string.
272
273 The fields pos_min and pos_delta are used to store the minimum offset
274 and the delta to the maximum offset at the current point in the pattern.
275
276*/
2c2d71f5
JH
277
278typedef struct scan_data_t {
1de06328
YO
279 /*I32 len_min; unused */
280 /*I32 len_delta; unused */
2c2d71f5
JH
281 I32 pos_min;
282 I32 pos_delta;
283 SV *last_found;
1de06328 284 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
285 I32 last_start_min;
286 I32 last_start_max;
1de06328
YO
287 SV **longest; /* Either &l_fixed, or &l_float. */
288 SV *longest_fixed; /* longest fixed string found in pattern */
289 I32 offset_fixed; /* offset where it starts */
290 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
291 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
292 SV *longest_float; /* longest floating string found in pattern */
293 I32 offset_float_min; /* earliest point in string it can appear */
294 I32 offset_float_max; /* latest point in string it can appear */
295 I32 *minlen_float; /* pointer to the minlen relevent to the string */
296 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
297 I32 flags;
298 I32 whilem_c;
cb434fcc 299 I32 *last_closep;
653099ff 300 struct regnode_charclass_class *start_class;
2c2d71f5
JH
301} scan_data_t;
302
a687059c 303/*
e50aee73 304 * Forward declarations for pregcomp()'s friends.
a687059c 305 */
a0d0e21e 306
27da23d5 307static const scan_data_t zero_scan_data =
1de06328 308 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
309
310#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
311#define SF_BEFORE_SEOL 0x0001
312#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
313#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
314#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
315
09b7f37c
CB
316#ifdef NO_UNARY_PLUS
317# define SF_FIX_SHIFT_EOL (0+2)
318# define SF_FL_SHIFT_EOL (0+4)
319#else
320# define SF_FIX_SHIFT_EOL (+2)
321# define SF_FL_SHIFT_EOL (+4)
322#endif
c277df42
IZ
323
324#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
325#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
326
327#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
328#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
329#define SF_IS_INF 0x0040
330#define SF_HAS_PAR 0x0080
331#define SF_IN_PAR 0x0100
332#define SF_HAS_EVAL 0x0200
333#define SCF_DO_SUBSTR 0x0400
653099ff
GS
334#define SCF_DO_STCLASS_AND 0x0800
335#define SCF_DO_STCLASS_OR 0x1000
336#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 337#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 338
786e8c11 339#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 340#define SCF_SEEN_ACCEPT 0x8000
07be1b83 341
eb160463 342#define UTF (RExC_utf8 != 0)
e2509266
JH
343#define LOC ((RExC_flags & PMf_LOCALE) != 0)
344#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 345
ffc61ed2 346#define OOB_UNICODE 12345678
93733859 347#define OOB_NAMEDCLASS -1
b8c5462f 348
a0ed51b3
LW
349#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
350#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
351
8615cb43 352
b45f050a
JF
353/* length of regex to show in messages that don't mark a position within */
354#define RegexLengthToShowInErrorMessages 127
355
356/*
357 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
358 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
359 * op/pragma/warn/regcomp.
360 */
7253e4e3
RK
361#define MARKER1 "<-- HERE" /* marker as it appears in the description */
362#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 363
7253e4e3 364#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
365
366/*
367 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
368 * arg. Show regex, up to a maximum length. If it's too long, chop and add
369 * "...".
370 */
ccb2c380 371#define FAIL(msg) STMT_START { \
bfed75c6 372 const char *ellipses = ""; \
ccb2c380
MP
373 IV len = RExC_end - RExC_precomp; \
374 \
375 if (!SIZE_ONLY) \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
377 if (len > RegexLengthToShowInErrorMessages) { \
378 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
379 len = RegexLengthToShowInErrorMessages - 10; \
380 ellipses = "..."; \
381 } \
382 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
383 msg, (int)len, RExC_precomp, ellipses); \
384} STMT_END
8615cb43 385
b45f050a 386/*
b45f050a
JF
387 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
388 */
ccb2c380 389#define Simple_vFAIL(m) STMT_START { \
a28509cc 390 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
391 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
392 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
b45f050a
JF
394
395/*
396 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
397 */
ccb2c380
MP
398#define vFAIL(m) STMT_START { \
399 if (!SIZE_ONLY) \
400 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
401 Simple_vFAIL(m); \
402} STMT_END
b45f050a
JF
403
404/*
405 * Like Simple_vFAIL(), but accepts two arguments.
406 */
ccb2c380 407#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 408 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
409 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
410 (int)offset, RExC_precomp, RExC_precomp + offset); \
411} STMT_END
b45f050a
JF
412
413/*
414 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
415 */
ccb2c380
MP
416#define vFAIL2(m,a1) STMT_START { \
417 if (!SIZE_ONLY) \
418 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
419 Simple_vFAIL2(m, a1); \
420} STMT_END
b45f050a
JF
421
422
423/*
424 * Like Simple_vFAIL(), but accepts three arguments.
425 */
ccb2c380 426#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
429 (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
434 */
ccb2c380
MP
435#define vFAIL3(m,a1,a2) STMT_START { \
436 if (!SIZE_ONLY) \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
438 Simple_vFAIL3(m, a1, a2); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts four arguments.
443 */
ccb2c380 444#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a 449
ccb2c380 450#define vWARN(loc,m) STMT_START { \
a28509cc 451 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
452 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
453 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
454} STMT_END
455
456#define vWARNdep(loc,m) STMT_START { \
a28509cc 457 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
458 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
459 "%s" REPORT_LOCATION, \
460 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
461} STMT_END
462
463
464#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 465 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
467 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
469
470#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 471 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
472 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
473 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
475
476#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 477 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
480} STMT_END
481
482#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 483 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
484 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
485 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
486} STMT_END
9d1d55b5 487
8615cb43 488
cd439c50 489/* Allow for side effects in s */
ccb2c380
MP
490#define REGC(c,s) STMT_START { \
491 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
492} STMT_END
cd439c50 493
fac92740
MJD
494/* Macros for recording node offsets. 20001227 mjd@plover.com
495 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
496 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
497 * Element 0 holds the number n.
07be1b83 498 * Position is 1 indexed.
fac92740
MJD
499 */
500
ccb2c380
MP
501#define Set_Node_Offset_To_R(node,byte) STMT_START { \
502 if (! SIZE_ONLY) { \
503 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 504 __LINE__, (node), (int)(byte))); \
ccb2c380 505 if((node) < 0) { \
551405c4 506 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
507 } else { \
508 RExC_offsets[2*(node)-1] = (byte); \
509 } \
510 } \
511} STMT_END
512
513#define Set_Node_Offset(node,byte) \
514 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
515#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
516
517#define Set_Node_Length_To_R(node,len) STMT_START { \
518 if (! SIZE_ONLY) { \
519 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 520 __LINE__, (int)(node), (int)(len))); \
ccb2c380 521 if((node) < 0) { \
551405c4 522 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
523 } else { \
524 RExC_offsets[2*(node)] = (len); \
525 } \
526 } \
527} STMT_END
528
529#define Set_Node_Length(node,len) \
530 Set_Node_Length_To_R((node)-RExC_emit_start, len)
531#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
532#define Set_Node_Cur_Length(node) \
533 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
534
535/* Get offsets and lengths */
536#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
537#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
538
07be1b83
YO
539#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
540 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
541 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
542} STMT_END
543
544
545#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
546#define EXPERIMENTAL_INPLACESCAN
547#endif
548
1de06328 549#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 550DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
551 PerlIO_printf(Perl_debug_log, \
552 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
553 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
554 (int)(depth)*2, "", \
555 (IV)((data)->pos_min), \
556 (IV)((data)->pos_delta), \
557 (IV)((data)->flags), \
558 (IV)((data)->whilem_c), \
559 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
560 ); \
561 if ((data)->last_found) \
562 PerlIO_printf(Perl_debug_log, \
563 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
564 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
565 SvPVX_const((data)->last_found), \
566 (IV)((data)->last_end), \
567 (IV)((data)->last_start_min), \
568 (IV)((data)->last_start_max), \
569 ((data)->longest && \
570 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
571 SvPVX_const((data)->longest_fixed), \
572 (IV)((data)->offset_fixed), \
573 ((data)->longest && \
574 (data)->longest==&((data)->longest_float)) ? "*" : "", \
575 SvPVX_const((data)->longest_float), \
576 (IV)((data)->offset_float_min), \
577 (IV)((data)->offset_float_max) \
578 ); \
579 PerlIO_printf(Perl_debug_log,"\n"); \
580});
581
acfe0abc 582static void clear_re(pTHX_ void *r);
4327152a 583
653099ff 584/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 585 Update the longest found anchored substring and the longest found
653099ff
GS
586 floating substrings if needed. */
587
4327152a 588STATIC void
1de06328 589S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 590{
e1ec3a88
AL
591 const STRLEN l = CHR_SVLEN(data->last_found);
592 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 593 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 594
c277df42 595 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 596 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
597 if (*data->longest == data->longest_fixed) {
598 data->offset_fixed = l ? data->last_start_min : data->pos_min;
599 if (data->flags & SF_BEFORE_EOL)
b81d288d 600 data->flags
c277df42
IZ
601 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
602 else
603 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
604 data->minlen_fixed=minlenp;
605 data->lookbehind_fixed=0;
a0ed51b3
LW
606 }
607 else {
c277df42 608 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
609 data->offset_float_max = (l
610 ? data->last_start_max
c277df42 611 : data->pos_min + data->pos_delta);
9051bda5
HS
612 if ((U32)data->offset_float_max > (U32)I32_MAX)
613 data->offset_float_max = I32_MAX;
c277df42 614 if (data->flags & SF_BEFORE_EOL)
b81d288d 615 data->flags
c277df42
IZ
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
617 else
618 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
619 data->minlen_float=minlenp;
620 data->lookbehind_float=0;
c277df42
IZ
621 }
622 }
623 SvCUR_set(data->last_found, 0);
0eda9292 624 {
a28509cc 625 SV * const sv = data->last_found;
097eb12c
AL
626 if (SvUTF8(sv) && SvMAGICAL(sv)) {
627 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
628 if (mg)
629 mg->mg_len = 0;
630 }
0eda9292 631 }
c277df42
IZ
632 data->last_end = -1;
633 data->flags &= ~SF_BEFORE_EOL;
1de06328 634 DEBUG_STUDYDATA(data,0);
c277df42
IZ
635}
636
653099ff
GS
637/* Can match anything (initialization) */
638STATIC void
097eb12c 639S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 640{
653099ff 641 ANYOF_CLASS_ZERO(cl);
f8bef550 642 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 643 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
644 if (LOC)
645 cl->flags |= ANYOF_LOCALE;
646}
647
648/* Can match anything (initialization) */
649STATIC int
5f66b61c 650S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
651{
652 int value;
653
aaa51d5e 654 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
655 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
656 return 1;
1aa99e6b
IH
657 if (!(cl->flags & ANYOF_UNICODE_ALL))
658 return 0;
10edeb5d 659 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 660 return 0;
653099ff
GS
661 return 1;
662}
663
664/* Can match anything (initialization) */
665STATIC void
097eb12c 666S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 667{
8ecf7187 668 Zero(cl, 1, struct regnode_charclass_class);
653099ff 669 cl->type = ANYOF;
830247a4 670 cl_anything(pRExC_state, cl);
653099ff
GS
671}
672
673STATIC void
097eb12c 674S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 675{
8ecf7187 676 Zero(cl, 1, struct regnode_charclass_class);
653099ff 677 cl->type = ANYOF;
830247a4 678 cl_anything(pRExC_state, cl);
653099ff
GS
679 if (LOC)
680 cl->flags |= ANYOF_LOCALE;
681}
682
683/* 'And' a given class with another one. Can create false positives */
684/* We assume that cl is not inverted */
685STATIC void
5f66b61c 686S_cl_and(struct regnode_charclass_class *cl,
a28509cc 687 const struct regnode_charclass_class *and_with)
653099ff 688{
40d049e4
YO
689
690 assert(and_with->type == ANYOF);
653099ff
GS
691 if (!(and_with->flags & ANYOF_CLASS)
692 && !(cl->flags & ANYOF_CLASS)
693 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
694 && !(and_with->flags & ANYOF_FOLD)
695 && !(cl->flags & ANYOF_FOLD)) {
696 int i;
697
698 if (and_with->flags & ANYOF_INVERT)
699 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
700 cl->bitmap[i] &= ~and_with->bitmap[i];
701 else
702 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
703 cl->bitmap[i] &= and_with->bitmap[i];
704 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
705 if (!(and_with->flags & ANYOF_EOS))
706 cl->flags &= ~ANYOF_EOS;
1aa99e6b 707
14ebb1a2
JH
708 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
709 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
710 cl->flags &= ~ANYOF_UNICODE_ALL;
711 cl->flags |= ANYOF_UNICODE;
712 ARG_SET(cl, ARG(and_with));
713 }
14ebb1a2
JH
714 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
715 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 716 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
717 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
718 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 719 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
720}
721
722/* 'OR' a given class with another one. Can create false positives */
723/* We assume that cl is not inverted */
724STATIC void
097eb12c 725S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 726{
653099ff
GS
727 if (or_with->flags & ANYOF_INVERT) {
728 /* We do not use
729 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
730 * <= (B1 | !B2) | (CL1 | !CL2)
731 * which is wasteful if CL2 is small, but we ignore CL2:
732 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
733 * XXXX Can we handle case-fold? Unclear:
734 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
735 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
736 */
737 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
738 && !(or_with->flags & ANYOF_FOLD)
739 && !(cl->flags & ANYOF_FOLD) ) {
740 int i;
741
742 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
743 cl->bitmap[i] |= ~or_with->bitmap[i];
744 } /* XXXX: logic is complicated otherwise */
745 else {
830247a4 746 cl_anything(pRExC_state, cl);
653099ff
GS
747 }
748 } else {
749 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
750 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 751 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
752 || (cl->flags & ANYOF_FOLD)) ) {
753 int i;
754
755 /* OR char bitmap and class bitmap separately */
756 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
757 cl->bitmap[i] |= or_with->bitmap[i];
758 if (or_with->flags & ANYOF_CLASS) {
759 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
760 cl->classflags[i] |= or_with->classflags[i];
761 cl->flags |= ANYOF_CLASS;
762 }
763 }
764 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 765 cl_anything(pRExC_state, cl);
653099ff
GS
766 }
767 }
768 if (or_with->flags & ANYOF_EOS)
769 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
770
771 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
772 ARG(cl) != ARG(or_with)) {
773 cl->flags |= ANYOF_UNICODE_ALL;
774 cl->flags &= ~ANYOF_UNICODE;
775 }
776 if (or_with->flags & ANYOF_UNICODE_ALL) {
777 cl->flags |= ANYOF_UNICODE_ALL;
778 cl->flags &= ~ANYOF_UNICODE;
779 }
653099ff
GS
780}
781
a3621e74
YO
782#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
783#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
784#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
785#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
786
3dab1dad
YO
787
788#ifdef DEBUGGING
07be1b83 789/*
3dab1dad
YO
790 dump_trie(trie)
791 dump_trie_interim_list(trie,next_alloc)
792 dump_trie_interim_table(trie,next_alloc)
793
794 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
795 The _interim_ variants are used for debugging the interim
796 tables that are used to generate the final compressed
797 representation which is what dump_trie expects.
798
3dab1dad
YO
799 Part of the reason for their existance is to provide a form
800 of documentation as to how the different representations function.
07be1b83
YO
801
802*/
3dab1dad
YO
803
804/*
805 dump_trie(trie)
806 Dumps the final compressed table form of the trie to Perl_debug_log.
807 Used for debugging make_trie().
808*/
809
810STATIC void
811S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
812{
813 U32 state;
ab3bbdeb
YO
814 SV *sv=sv_newmortal();
815 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
816 GET_RE_DEBUG_FLAGS_DECL;
817
ab3bbdeb 818
3dab1dad
YO
819 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
820 (int)depth * 2 + 2,"",
821 "Match","Base","Ofs" );
822
823 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 824 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 825 if ( tmp ) {
ab3bbdeb
YO
826 PerlIO_printf( Perl_debug_log, "%*s",
827 colwidth,
ddc5bc0f 828 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
829 PL_colors[0], PL_colors[1],
830 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
831 PERL_PV_ESCAPE_FIRSTCHAR
832 )
833 );
3dab1dad
YO
834 }
835 }
836 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
837 (int)depth * 2 + 2,"");
838
839 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 840 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
841 PerlIO_printf( Perl_debug_log, "\n");
842
1e2e3d02 843 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 844 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
845
846 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
847
848 if ( trie->states[ state ].wordnum ) {
849 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
850 } else {
851 PerlIO_printf( Perl_debug_log, "%6s", "" );
852 }
853
854 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
855
856 if ( base ) {
857 U32 ofs = 0;
858
859 while( ( base + ofs < trie->uniquecharcount ) ||
860 ( base + ofs - trie->uniquecharcount < trie->lasttrans
861 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
862 ofs++;
863
864 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
865
866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
867 if ( ( base + ofs >= trie->uniquecharcount ) &&
868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
870 {
ab3bbdeb
YO
871 PerlIO_printf( Perl_debug_log, "%*"UVXf,
872 colwidth,
3dab1dad
YO
873 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
874 } else {
ab3bbdeb 875 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
876 }
877 }
878
879 PerlIO_printf( Perl_debug_log, "]");
880
881 }
882 PerlIO_printf( Perl_debug_log, "\n" );
883 }
884}
885/*
886 dump_trie_interim_list(trie,next_alloc)
887 Dumps a fully constructed but uncompressed trie in list form.
888 List tries normally only are used for construction when the number of
889 possible chars (trie->uniquecharcount) is very high.
890 Used for debugging make_trie().
891*/
892STATIC void
893S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
894{
895 U32 state;
ab3bbdeb
YO
896 SV *sv=sv_newmortal();
897 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
898 GET_RE_DEBUG_FLAGS_DECL;
899 /* print out the table precompression. */
ab3bbdeb
YO
900 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
901 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
902 "------:-----+-----------------\n" );
3dab1dad
YO
903
904 for( state=1 ; state < next_alloc ; state ++ ) {
905 U16 charid;
906
ab3bbdeb 907 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
908 (int)depth * 2 + 2,"", (UV)state );
909 if ( ! trie->states[ state ].wordnum ) {
910 PerlIO_printf( Perl_debug_log, "%5s| ","");
911 } else {
912 PerlIO_printf( Perl_debug_log, "W%4x| ",
913 trie->states[ state ].wordnum
914 );
915 }
916 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 917 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
918 if ( tmp ) {
919 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
920 colwidth,
ddc5bc0f 921 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
922 PL_colors[0], PL_colors[1],
923 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
924 PERL_PV_ESCAPE_FIRSTCHAR
925 ) ,
1e2e3d02
YO
926 TRIE_LIST_ITEM(state,charid).forid,
927 (UV)TRIE_LIST_ITEM(state,charid).newstate
928 );
929 if (!(charid % 10))
664e119d
RGS
930 PerlIO_printf(Perl_debug_log, "\n%*s| ",
931 (int)((depth * 2) + 14), "");
1e2e3d02 932 }
ab3bbdeb
YO
933 }
934 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
935 }
936}
937
938/*
939 dump_trie_interim_table(trie,next_alloc)
940 Dumps a fully constructed but uncompressed trie in table form.
941 This is the normal DFA style state transition table, with a few
942 twists to facilitate compression later.
943 Used for debugging make_trie().
944*/
945STATIC void
946S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
947{
948 U32 state;
949 U16 charid;
ab3bbdeb
YO
950 SV *sv=sv_newmortal();
951 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
952 GET_RE_DEBUG_FLAGS_DECL;
953
954 /*
955 print out the table precompression so that we can do a visual check
956 that they are identical.
957 */
958
959 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
960
961 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 962 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 963 if ( tmp ) {
ab3bbdeb
YO
964 PerlIO_printf( Perl_debug_log, "%*s",
965 colwidth,
ddc5bc0f 966 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
967 PL_colors[0], PL_colors[1],
968 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
969 PERL_PV_ESCAPE_FIRSTCHAR
970 )
971 );
3dab1dad
YO
972 }
973 }
974
975 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
976
977 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 978 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
979 }
980
981 PerlIO_printf( Perl_debug_log, "\n" );
982
983 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
984
985 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
986 (int)depth * 2 + 2,"",
987 (UV)TRIE_NODENUM( state ) );
988
989 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
990 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
991 if (v)
992 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
993 else
994 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
995 }
996 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
998 } else {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1000 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1001 }
1002 }
07be1b83 1003}
3dab1dad
YO
1004
1005#endif
1006
786e8c11
YO
1007/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1008 startbranch: the first branch in the whole branch sequence
1009 first : start branch of sequence of branch-exact nodes.
1010 May be the same as startbranch
1011 last : Thing following the last branch.
1012 May be the same as tail.
1013 tail : item following the branch sequence
1014 count : words in the sequence
1015 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1016 depth : indent depth
3dab1dad 1017
786e8c11 1018Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1019
786e8c11
YO
1020A trie is an N'ary tree where the branches are determined by digital
1021decomposition of the key. IE, at the root node you look up the 1st character and
1022follow that branch repeat until you find the end of the branches. Nodes can be
1023marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1024
786e8c11 1025 /he|she|his|hers/
72f13be8 1026
786e8c11
YO
1027would convert into the following structure. Numbers represent states, letters
1028following numbers represent valid transitions on the letter from that state, if
1029the number is in square brackets it represents an accepting state, otherwise it
1030will be in parenthesis.
07be1b83 1031
786e8c11
YO
1032 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1033 | |
1034 | (2)
1035 | |
1036 (1) +-i->(6)-+-s->[7]
1037 |
1038 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1039
786e8c11
YO
1040 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1041
1042This shows that when matching against the string 'hers' we will begin at state 1
1043read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1044then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1045is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1046single traverse. We store a mapping from accepting to state to which word was
1047matched, and then when we have multiple possibilities we try to complete the
1048rest of the regex in the order in which they occured in the alternation.
1049
1050The only prior NFA like behaviour that would be changed by the TRIE support is
1051the silent ignoring of duplicate alternations which are of the form:
1052
1053 / (DUPE|DUPE) X? (?{ ... }) Y /x
1054
1055Thus EVAL blocks follwing a trie may be called a different number of times with
1056and without the optimisation. With the optimisations dupes will be silently
1057ignored. This inconsistant behaviour of EVAL type nodes is well established as
1058the following demonstrates:
1059
1060 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1061
1062which prints out 'word' three times, but
1063
1064 'words'=~/(word|word|word)(?{ print $1 })S/
1065
1066which doesnt print it out at all. This is due to other optimisations kicking in.
1067
1068Example of what happens on a structural level:
1069
1070The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1071
1072 1: CURLYM[1] {1,32767}(18)
1073 5: BRANCH(8)
1074 6: EXACT <ac>(16)
1075 8: BRANCH(11)
1076 9: EXACT <ad>(16)
1077 11: BRANCH(14)
1078 12: EXACT <ab>(16)
1079 16: SUCCEED(0)
1080 17: NOTHING(18)
1081 18: END(0)
1082
1083This would be optimizable with startbranch=5, first=5, last=16, tail=16
1084and should turn into:
1085
1086 1: CURLYM[1] {1,32767}(18)
1087 5: TRIE(16)
1088 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1089 <ac>
1090 <ad>
1091 <ab>
1092 16: SUCCEED(0)
1093 17: NOTHING(18)
1094 18: END(0)
1095
1096Cases where tail != last would be like /(?foo|bar)baz/:
1097
1098 1: BRANCH(4)
1099 2: EXACT <foo>(8)
1100 4: BRANCH(7)
1101 5: EXACT <bar>(8)
1102 7: TAIL(8)
1103 8: EXACT <baz>(10)
1104 10: END(0)
1105
1106which would be optimizable with startbranch=1, first=1, last=7, tail=8
1107and would end up looking like:
1108
1109 1: TRIE(8)
1110 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1111 <foo>
1112 <bar>
1113 7: TAIL(8)
1114 8: EXACT <baz>(10)
1115 10: END(0)
1116
1117 d = uvuni_to_utf8_flags(d, uv, 0);
1118
1119is the recommended Unicode-aware way of saying
1120
1121 *(d++) = uv;
1122*/
1123
1e2e3d02 1124#define TRIE_STORE_REVCHAR \
786e8c11 1125 STMT_START { \
1e2e3d02 1126 SV *tmp = newSVpvs(""); \
786e8c11 1127 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1128 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1129 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1130 } STMT_END
1131
1132#define TRIE_READ_CHAR STMT_START { \
1133 wordlen++; \
1134 if ( UTF ) { \
1135 if ( folder ) { \
1136 if ( foldlen > 0 ) { \
1137 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1138 foldlen -= len; \
1139 scan += len; \
1140 len = 0; \
1141 } else { \
1142 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1143 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1144 foldlen -= UNISKIP( uvc ); \
1145 scan = foldbuf + UNISKIP( uvc ); \
1146 } \
1147 } else { \
1148 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1149 } \
1150 } else { \
1151 uvc = (U32)*uc; \
1152 len = 1; \
1153 } \
1154} STMT_END
1155
1156
1157
1158#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1159 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1160 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1161 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1162 } \
1163 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1164 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1165 TRIE_LIST_CUR( state )++; \
1166} STMT_END
07be1b83 1167
786e8c11
YO
1168#define TRIE_LIST_NEW(state) STMT_START { \
1169 Newxz( trie->states[ state ].trans.list, \
1170 4, reg_trie_trans_le ); \
1171 TRIE_LIST_CUR( state ) = 1; \
1172 TRIE_LIST_LEN( state ) = 4; \
1173} STMT_END
07be1b83 1174
786e8c11
YO
1175#define TRIE_HANDLE_WORD(state) STMT_START { \
1176 U16 dupe= trie->states[ state ].wordnum; \
1177 regnode * const noper_next = regnext( noper ); \
1178 \
1179 if (trie->wordlen) \
1180 trie->wordlen[ curword ] = wordlen; \
1181 DEBUG_r({ \
1182 /* store the word for dumping */ \
1183 SV* tmp; \
1184 if (OP(noper) != NOTHING) \
1185 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1186 else \
1187 tmp = newSVpvn( "", 0 ); \
1188 if ( UTF ) SvUTF8_on( tmp ); \
1189 av_push( trie->words, tmp ); \
1190 }); \
1191 \
1192 curword++; \
1193 \
1194 if ( noper_next < tail ) { \
1195 if (!trie->jump) \
1196 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1197 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1198 if (!jumper) \
1199 jumper = noper_next; \
1200 if (!nextbranch) \
1201 nextbranch= regnext(cur); \
1202 } \
1203 \
1204 if ( dupe ) { \
1205 /* So it's a dupe. This means we need to maintain a */\
1206 /* linked-list from the first to the next. */\
1207 /* we only allocate the nextword buffer when there */\
1208 /* a dupe, so first time we have to do the allocation */\
1209 if (!trie->nextword) \
1210 Newxz( trie->nextword, word_count + 1, U16); \
1211 while ( trie->nextword[dupe] ) \
1212 dupe= trie->nextword[dupe]; \
1213 trie->nextword[dupe]= curword; \
1214 } else { \
1215 /* we haven't inserted this word yet. */ \
1216 trie->states[ state ].wordnum = curword; \
1217 } \
1218} STMT_END
07be1b83 1219
3dab1dad 1220
786e8c11
YO
1221#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1222 ( ( base + charid >= ucharcount \
1223 && base + charid < ubound \
1224 && state == trie->trans[ base - ucharcount + charid ].check \
1225 && trie->trans[ base - ucharcount + charid ].next ) \
1226 ? trie->trans[ base - ucharcount + charid ].next \
1227 : ( state==1 ? special : 0 ) \
1228 )
3dab1dad 1229
786e8c11
YO
1230#define MADE_TRIE 1
1231#define MADE_JUMP_TRIE 2
1232#define MADE_EXACT_TRIE 4
3dab1dad 1233
a3621e74 1234STATIC I32
786e8c11 1235S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1236{
27da23d5 1237 dVAR;
a3621e74
YO
1238 /* first pass, loop through and scan words */
1239 reg_trie_data *trie;
1240 regnode *cur;
9f7f3913 1241 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1242 STRLEN len = 0;
1243 UV uvc = 0;
1244 U16 curword = 0;
1245 U32 next_alloc = 0;
786e8c11
YO
1246 regnode *jumper = NULL;
1247 regnode *nextbranch = NULL;
7f69552c 1248 regnode *convert = NULL;
a3621e74 1249 /* we just use folder as a flag in utf8 */
e1ec3a88 1250 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1251 ? PL_fold
1252 : ( flags == EXACTFL
1253 ? PL_fold_locale
1254 : NULL
1255 )
1256 );
1257
e1ec3a88 1258 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1259 SV *re_trie_maxbuff;
3dab1dad
YO
1260#ifndef DEBUGGING
1261 /* these are only used during construction but are useful during
8e11feef 1262 * debugging so we store them in the struct when debugging.
8e11feef 1263 */
3dab1dad 1264 STRLEN trie_charcount=0;
3dab1dad
YO
1265 AV *trie_revcharmap;
1266#endif
a3621e74 1267 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1268#ifndef DEBUGGING
1269 PERL_UNUSED_ARG(depth);
1270#endif
a3621e74 1271
a02a5408 1272 Newxz( trie, 1, reg_trie_data );
a3621e74 1273 trie->refcount = 1;
3dab1dad 1274 trie->startstate = 1;
786e8c11 1275 trie->wordcount = word_count;
a3621e74 1276 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1277 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1278 if (!(UTF && folder))
1279 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1280 DEBUG_r({
1281 trie->words = newAV();
a3621e74 1282 });
3dab1dad 1283 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1284
0111c4fd 1285 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1286 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1287 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1288 }
3dab1dad
YO
1289 DEBUG_OPTIMISE_r({
1290 PerlIO_printf( Perl_debug_log,
786e8c11 1291 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1292 (int)depth * 2 + 2, "",
1293 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1294 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1295 (int)depth);
3dab1dad 1296 });
7f69552c
YO
1297
1298 /* Find the node we are going to overwrite */
1299 if ( first == startbranch && OP( last ) != BRANCH ) {
1300 /* whole branch chain */
1301 convert = first;
1302 } else {
1303 /* branch sub-chain */
1304 convert = NEXTOPER( first );
1305 }
1306
a3621e74
YO
1307 /* -- First loop and Setup --
1308
1309 We first traverse the branches and scan each word to determine if it
1310 contains widechars, and how many unique chars there are, this is
1311 important as we have to build a table with at least as many columns as we
1312 have unique chars.
1313
1314 We use an array of integers to represent the character codes 0..255
1315 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1316 native representation of the character value as the key and IV's for the
1317 coded index.
1318
1319 *TODO* If we keep track of how many times each character is used we can
1320 remap the columns so that the table compression later on is more
1321 efficient in terms of memory by ensuring most common value is in the
1322 middle and the least common are on the outside. IMO this would be better
1323 than a most to least common mapping as theres a decent chance the most
1324 common letter will share a node with the least common, meaning the node
1325 will not be compressable. With a middle is most common approach the worst
1326 case is when we have the least common nodes twice.
1327
1328 */
1329
a3621e74 1330 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1331 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1332 const U8 *uc = (U8*)STRING( noper );
a28509cc 1333 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1334 STRLEN foldlen = 0;
1335 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1336 const U8 *scan = (U8*)NULL;
07be1b83 1337 U32 wordlen = 0; /* required init */
3dab1dad 1338 STRLEN chars=0;
a3621e74 1339
3dab1dad
YO
1340 if (OP(noper) == NOTHING) {
1341 trie->minlen= 0;
1342 continue;
1343 }
1344 if (trie->bitmap) {
1345 TRIE_BITMAP_SET(trie,*uc);
1346 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1347 }
a3621e74 1348 for ( ; uc < e ; uc += len ) {
3dab1dad 1349 TRIE_CHARCOUNT(trie)++;
a3621e74 1350 TRIE_READ_CHAR;
3dab1dad 1351 chars++;
a3621e74
YO
1352 if ( uvc < 256 ) {
1353 if ( !trie->charmap[ uvc ] ) {
1354 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1355 if ( folder )
1356 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1357 TRIE_STORE_REVCHAR;
a3621e74
YO
1358 }
1359 } else {
1360 SV** svpp;
1361 if ( !trie->widecharmap )
1362 trie->widecharmap = newHV();
1363
1364 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1365
1366 if ( !svpp )
e4584336 1367 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1368
1369 if ( !SvTRUE( *svpp ) ) {
1370 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1371 TRIE_STORE_REVCHAR;
a3621e74
YO
1372 }
1373 }
1374 }
3dab1dad
YO
1375 if( cur == first ) {
1376 trie->minlen=chars;
1377 trie->maxlen=chars;
1378 } else if (chars < trie->minlen) {
1379 trie->minlen=chars;
1380 } else if (chars > trie->maxlen) {
1381 trie->maxlen=chars;
1382 }
1383
a3621e74
YO
1384 } /* end first pass */
1385 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1386 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1387 (int)depth * 2 + 2,"",
85c3142d 1388 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1389 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1390 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1391 );
786e8c11 1392 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1393
1394 /*
1395 We now know what we are dealing with in terms of unique chars and
1396 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1397 representation using a flat table will take. If it's over a reasonable
1398 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1399 conservative but potentially much slower representation using an array
1400 of lists.
1401
1402 At the end we convert both representations into the same compressed
1403 form that will be used in regexec.c for matching with. The latter
1404 is a form that cannot be used to construct with but has memory
1405 properties similar to the list form and access properties similar
1406 to the table form making it both suitable for fast searches and
1407 small enough that its feasable to store for the duration of a program.
1408
1409 See the comment in the code where the compressed table is produced
1410 inplace from the flat tabe representation for an explanation of how
1411 the compression works.
1412
1413 */
1414
1415
3dab1dad 1416 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1417 /*
1418 Second Pass -- Array Of Lists Representation
1419
1420 Each state will be represented by a list of charid:state records
1421 (reg_trie_trans_le) the first such element holds the CUR and LEN
1422 points of the allocated array. (See defines above).
1423
1424 We build the initial structure using the lists, and then convert
1425 it into the compressed table form which allows faster lookups
1426 (but cant be modified once converted).
a3621e74
YO
1427 */
1428
a3621e74
YO
1429 STRLEN transcount = 1;
1430
1e2e3d02
YO
1431 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1432 "%*sCompiling trie using list compiler\n",
1433 (int)depth * 2 + 2, ""));
1434
3dab1dad 1435 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1436 TRIE_LIST_NEW(1);
1437 next_alloc = 2;
1438
1439 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1440
c445ea15
AL
1441 regnode * const noper = NEXTOPER( cur );
1442 U8 *uc = (U8*)STRING( noper );
1443 const U8 * const e = uc + STR_LEN( noper );
1444 U32 state = 1; /* required init */
1445 U16 charid = 0; /* sanity init */
1446 U8 *scan = (U8*)NULL; /* sanity init */
1447 STRLEN foldlen = 0; /* required init */
07be1b83 1448 U32 wordlen = 0; /* required init */
c445ea15
AL
1449 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1450
3dab1dad 1451 if (OP(noper) != NOTHING) {
786e8c11 1452 for ( ; uc < e ; uc += len ) {
c445ea15 1453
786e8c11 1454 TRIE_READ_CHAR;
c445ea15 1455
786e8c11
YO
1456 if ( uvc < 256 ) {
1457 charid = trie->charmap[ uvc ];
c445ea15 1458 } else {
786e8c11
YO
1459 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1460 if ( !svpp ) {
1461 charid = 0;
1462 } else {
1463 charid=(U16)SvIV( *svpp );
1464 }
c445ea15 1465 }
786e8c11
YO
1466 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1467 if ( charid ) {
a3621e74 1468
786e8c11
YO
1469 U16 check;
1470 U32 newstate = 0;
a3621e74 1471
786e8c11
YO
1472 charid--;
1473 if ( !trie->states[ state ].trans.list ) {
1474 TRIE_LIST_NEW( state );
c445ea15 1475 }
786e8c11
YO
1476 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1477 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1478 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1479 break;
1480 }
1481 }
1482 if ( ! newstate ) {
1483 newstate = next_alloc++;
1484 TRIE_LIST_PUSH( state, charid, newstate );
1485 transcount++;
1486 }
1487 state = newstate;
1488 } else {
1489 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1490 }
a28509cc 1491 }
c445ea15 1492 }
3dab1dad 1493 TRIE_HANDLE_WORD(state);
a3621e74
YO
1494
1495 } /* end second pass */
1496
1e2e3d02
YO
1497 /* next alloc is the NEXT state to be allocated */
1498 trie->statecount = next_alloc;
a3621e74
YO
1499 Renew( trie->states, next_alloc, reg_trie_state );
1500
3dab1dad
YO
1501 /* and now dump it out before we compress it */
1502 DEBUG_TRIE_COMPILE_MORE_r(
1503 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1504 );
a3621e74 1505
a02a5408 1506 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1507 {
1508 U32 state;
a3621e74
YO
1509 U32 tp = 0;
1510 U32 zp = 0;
1511
1512
1513 for( state=1 ; state < next_alloc ; state ++ ) {
1514 U32 base=0;
1515
1516 /*
1517 DEBUG_TRIE_COMPILE_MORE_r(
1518 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1519 );
1520 */
1521
1522 if (trie->states[state].trans.list) {
1523 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1524 U16 maxid=minid;
a28509cc 1525 U16 idx;
a3621e74
YO
1526
1527 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1528 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1529 if ( forid < minid ) {
1530 minid=forid;
1531 } else if ( forid > maxid ) {
1532 maxid=forid;
1533 }
a3621e74
YO
1534 }
1535 if ( transcount < tp + maxid - minid + 1) {
1536 transcount *= 2;
1537 Renew( trie->trans, transcount, reg_trie_trans );
1538 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1539 }
1540 base = trie->uniquecharcount + tp - minid;
1541 if ( maxid == minid ) {
1542 U32 set = 0;
1543 for ( ; zp < tp ; zp++ ) {
1544 if ( ! trie->trans[ zp ].next ) {
1545 base = trie->uniquecharcount + zp - minid;
1546 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1547 trie->trans[ zp ].check = state;
1548 set = 1;
1549 break;
1550 }
1551 }
1552 if ( !set ) {
1553 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1554 trie->trans[ tp ].check = state;
1555 tp++;
1556 zp = tp;
1557 }
1558 } else {
1559 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1560 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1561 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1562 trie->trans[ tid ].check = state;
1563 }
1564 tp += ( maxid - minid + 1 );
1565 }
1566 Safefree(trie->states[ state ].trans.list);
1567 }
1568 /*
1569 DEBUG_TRIE_COMPILE_MORE_r(
1570 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1571 );
1572 */
1573 trie->states[ state ].trans.base=base;
1574 }
cc601c31 1575 trie->lasttrans = tp + 1;
a3621e74
YO
1576 }
1577 } else {
1578 /*
1579 Second Pass -- Flat Table Representation.
1580
1581 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1582 We know that we will need Charcount+1 trans at most to store the data
1583 (one row per char at worst case) So we preallocate both structures
1584 assuming worst case.
1585
1586 We then construct the trie using only the .next slots of the entry
1587 structs.
1588
1589 We use the .check field of the first entry of the node temporarily to
1590 make compression both faster and easier by keeping track of how many non
1591 zero fields are in the node.
1592
1593 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1594 transition.
1595
1596 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1597 number representing the first entry of the node, and state as a
1598 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1599 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1600 are 2 entrys per node. eg:
1601
1602 A B A B
1603 1. 2 4 1. 3 7
1604 2. 0 3 3. 0 5
1605 3. 0 0 5. 0 0
1606 4. 0 0 7. 0 0
1607
1608 The table is internally in the right hand, idx form. However as we also
1609 have to deal with the states array which is indexed by nodenum we have to
1610 use TRIE_NODENUM() to convert.
1611
1612 */
1e2e3d02
YO
1613 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1614 "%*sCompiling trie using table compiler\n",
1615 (int)depth * 2 + 2, ""));
3dab1dad
YO
1616
1617 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1618 reg_trie_trans );
3dab1dad 1619 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1620 next_alloc = trie->uniquecharcount + 1;
1621
3dab1dad 1622
a3621e74
YO
1623 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1624
c445ea15 1625 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1626 const U8 *uc = (U8*)STRING( noper );
1627 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1628
1629 U32 state = 1; /* required init */
1630
1631 U16 charid = 0; /* sanity init */
1632 U32 accept_state = 0; /* sanity init */
1633 U8 *scan = (U8*)NULL; /* sanity init */
1634
1635 STRLEN foldlen = 0; /* required init */
07be1b83 1636 U32 wordlen = 0; /* required init */
a3621e74
YO
1637 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1638
3dab1dad 1639 if ( OP(noper) != NOTHING ) {
786e8c11 1640 for ( ; uc < e ; uc += len ) {
a3621e74 1641
786e8c11 1642 TRIE_READ_CHAR;
a3621e74 1643
786e8c11
YO
1644 if ( uvc < 256 ) {
1645 charid = trie->charmap[ uvc ];
1646 } else {
1647 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1648 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1649 }
786e8c11
YO
1650 if ( charid ) {
1651 charid--;
1652 if ( !trie->trans[ state + charid ].next ) {
1653 trie->trans[ state + charid ].next = next_alloc;
1654 trie->trans[ state ].check++;
1655 next_alloc += trie->uniquecharcount;
1656 }
1657 state = trie->trans[ state + charid ].next;
1658 } else {
1659 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1660 }
1661 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1662 }
a3621e74 1663 }
3dab1dad
YO
1664 accept_state = TRIE_NODENUM( state );
1665 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1666
1667 } /* end second pass */
1668
3dab1dad
YO
1669 /* and now dump it out before we compress it */
1670 DEBUG_TRIE_COMPILE_MORE_r(
1671 dump_trie_interim_table(trie,next_alloc,depth+1)
1672 );
a3621e74 1673
a3621e74
YO
1674 {
1675 /*
1676 * Inplace compress the table.*
1677
1678 For sparse data sets the table constructed by the trie algorithm will
1679 be mostly 0/FAIL transitions or to put it another way mostly empty.
1680 (Note that leaf nodes will not contain any transitions.)
1681
1682 This algorithm compresses the tables by eliminating most such
1683 transitions, at the cost of a modest bit of extra work during lookup:
1684
1685 - Each states[] entry contains a .base field which indicates the
1686 index in the state[] array wheres its transition data is stored.
1687
1688 - If .base is 0 there are no valid transitions from that node.
1689
1690 - If .base is nonzero then charid is added to it to find an entry in
1691 the trans array.
1692
1693 -If trans[states[state].base+charid].check!=state then the
1694 transition is taken to be a 0/Fail transition. Thus if there are fail
1695 transitions at the front of the node then the .base offset will point
1696 somewhere inside the previous nodes data (or maybe even into a node
1697 even earlier), but the .check field determines if the transition is
1698 valid.
1699
786e8c11 1700 XXX - wrong maybe?
a3621e74
YO
1701 The following process inplace converts the table to the compressed
1702 table: We first do not compress the root node 1,and mark its all its
1703 .check pointers as 1 and set its .base pointer as 1 as well. This
1704 allows to do a DFA construction from the compressed table later, and
1705 ensures that any .base pointers we calculate later are greater than
1706 0.
1707
1708 - We set 'pos' to indicate the first entry of the second node.
1709
1710 - We then iterate over the columns of the node, finding the first and
1711 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1712 and set the .check pointers accordingly, and advance pos
1713 appropriately and repreat for the next node. Note that when we copy
1714 the next pointers we have to convert them from the original
1715 NODEIDX form to NODENUM form as the former is not valid post
1716 compression.
1717
1718 - If a node has no transitions used we mark its base as 0 and do not
1719 advance the pos pointer.
1720
1721 - If a node only has one transition we use a second pointer into the
1722 structure to fill in allocated fail transitions from other states.
1723 This pointer is independent of the main pointer and scans forward
1724 looking for null transitions that are allocated to a state. When it
1725 finds one it writes the single transition into the "hole". If the
786e8c11 1726 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1727
1728 - Once compressed we can Renew/realloc the structures to release the
1729 excess space.
1730
1731 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1732 specifically Fig 3.47 and the associated pseudocode.
1733
1734 demq
1735 */
a3b680e6 1736 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1737 U32 state, charid;
a3621e74 1738 U32 pos = 0, zp=0;
1e2e3d02 1739 trie->statecount = laststate;
a3621e74
YO
1740
1741 for ( state = 1 ; state < laststate ; state++ ) {
1742 U8 flag = 0;
a28509cc
AL
1743 const U32 stateidx = TRIE_NODEIDX( state );
1744 const U32 o_used = trie->trans[ stateidx ].check;
1745 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1746 trie->trans[ stateidx ].check = 0;
1747
1748 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1749 if ( flag || trie->trans[ stateidx + charid ].next ) {
1750 if ( trie->trans[ stateidx + charid ].next ) {
1751 if (o_used == 1) {
1752 for ( ; zp < pos ; zp++ ) {
1753 if ( ! trie->trans[ zp ].next ) {
1754 break;
1755 }
1756 }
1757 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1758 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1759 trie->trans[ zp ].check = state;
1760 if ( ++zp > pos ) pos = zp;
1761 break;
1762 }
1763 used--;
1764 }
1765 if ( !flag ) {
1766 flag = 1;
1767 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1768 }
1769 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1770 trie->trans[ pos ].check = state;
1771 pos++;
1772 }
1773 }
1774 }
cc601c31 1775 trie->lasttrans = pos + 1;
1e2e3d02 1776 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1777 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1778 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1779 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1780 (int)depth * 2 + 2,"",
1781 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1782 (IV)next_alloc,
1783 (IV)pos,
a3621e74
YO
1784 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1785 );
1786
1787 } /* end table compress */
1788 }
1e2e3d02
YO
1789 DEBUG_TRIE_COMPILE_MORE_r(
1790 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1791 (int)depth * 2 + 2, "",
1792 (UV)trie->statecount,
1793 (UV)trie->lasttrans)
1794 );
cc601c31
YO
1795 /* resize the trans array to remove unused space */
1796 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1797
3dab1dad
YO
1798 /* and now dump out the compressed format */
1799 DEBUG_TRIE_COMPILE_r(
1800 dump_trie(trie,depth+1)
1801 );
07be1b83 1802
3dab1dad 1803 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1804 U8 nodetype =(U8)(flags & 0xFF);
1805 char *str=NULL;
786e8c11 1806
07be1b83 1807#ifdef DEBUGGING
e62cc96a 1808 regnode *optimize = NULL;
b57a0404
JH
1809 U32 mjd_offset = 0;
1810 U32 mjd_nodelen = 0;
07be1b83 1811#endif
a3621e74 1812 /*
3dab1dad
YO
1813 This means we convert either the first branch or the first Exact,
1814 depending on whether the thing following (in 'last') is a branch
1815 or not and whther first is the startbranch (ie is it a sub part of
1816 the alternation or is it the whole thing.)
1817 Assuming its a sub part we conver the EXACT otherwise we convert
1818 the whole branch sequence, including the first.
a3621e74 1819 */
3dab1dad 1820 /* Find the node we are going to overwrite */
7f69552c 1821 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1822 /* branch sub-chain */
3dab1dad 1823 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1824 DEBUG_r({
1825 mjd_offset= Node_Offset((convert));
1826 mjd_nodelen= Node_Length((convert));
1827 });
7f69552c
YO
1828 /* whole branch chain */
1829 } else {
1830 DEBUG_r({
1831 const regnode *nop = NEXTOPER( convert );
1832 mjd_offset= Node_Offset((nop));
1833 mjd_nodelen= Node_Length((nop));
1834 });
07be1b83 1835 }
7f69552c 1836
07be1b83
YO
1837 DEBUG_OPTIMISE_r(
1838 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1839 (int)depth * 2 + 2, "",
786e8c11 1840 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1841 );
a3621e74 1842
3dab1dad
YO
1843 /* But first we check to see if there is a common prefix we can
1844 split out as an EXACT and put in front of the TRIE node. */
1845 trie->startstate= 1;
786e8c11 1846 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1847 U32 state;
1e2e3d02 1848 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1849 U32 ofs = 0;
8e11feef
RGS
1850 I32 idx = -1;
1851 U32 count = 0;
1852 const U32 base = trie->states[ state ].trans.base;
a3621e74 1853
3dab1dad 1854 if ( trie->states[state].wordnum )
8e11feef 1855 count = 1;
a3621e74 1856
8e11feef 1857 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1858 if ( ( base + ofs >= trie->uniquecharcount ) &&
1859 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1860 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1861 {
3dab1dad 1862 if ( ++count > 1 ) {
8e11feef 1863 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1864 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1865 if ( state == 1 ) break;
3dab1dad
YO
1866 if ( count == 2 ) {
1867 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1868 DEBUG_OPTIMISE_r(
8e11feef
RGS
1869 PerlIO_printf(Perl_debug_log,
1870 "%*sNew Start State=%"UVuf" Class: [",
1871 (int)depth * 2 + 2, "",
786e8c11 1872 (UV)state));
be8e71aa
YO
1873 if (idx >= 0) {
1874 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1875 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1876
3dab1dad 1877 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1878 if ( folder )
1879 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1880 DEBUG_OPTIMISE_r(
07be1b83 1881 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1882 );
8e11feef
RGS
1883 }
1884 }
1885 TRIE_BITMAP_SET(trie,*ch);
1886 if ( folder )
1887 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1888 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1889 }
1890 idx = ofs;
1891 }
3dab1dad
YO
1892 }
1893 if ( count == 1 ) {
1894 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1895 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1896 DEBUG_OPTIMISE_r(
8e11feef
RGS
1897 PerlIO_printf( Perl_debug_log,
1898 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1899 (int)depth * 2 + 2, "",
786e8c11 1900 (UV)state, (UV)idx, ch)
3dab1dad
YO
1901 );
1902 if ( state==1 ) {
1903 OP( convert ) = nodetype;
1904 str=STRING(convert);
1905 STR_LEN(convert)=0;
1906 }
1907 *str++=*ch;
1908 STR_LEN(convert)++;
a3621e74 1909
8e11feef 1910 } else {
f9049ba1 1911#ifdef DEBUGGING
8e11feef
RGS
1912 if (state>1)
1913 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1914#endif
8e11feef
RGS
1915 break;
1916 }
1917 }
3dab1dad 1918 if (str) {
8e11feef 1919 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1920 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1921 trie->startstate = state;
07be1b83
YO
1922 trie->minlen -= (state - 1);
1923 trie->maxlen -= (state - 1);
1924 DEBUG_r({
1925 regnode *fix = convert;
1926 mjd_nodelen++;
1927 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1928 while( ++fix < n ) {
1929 Set_Node_Offset_Length(fix, 0, 0);
1930 }
1931 });
8e11feef
RGS
1932 if (trie->maxlen) {
1933 convert = n;
1934 } else {
3dab1dad 1935 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1936 DEBUG_r(optimize= n);
3dab1dad
YO
1937 }
1938 }
1939 }
a5ca303d
YO
1940 if (!jumper)
1941 jumper = last;
3dab1dad 1942 if ( trie->maxlen ) {
8e11feef
RGS
1943 NEXT_OFF( convert ) = (U16)(tail - convert);
1944 ARG_SET( convert, data_slot );
786e8c11
YO
1945 /* Store the offset to the first unabsorbed branch in
1946 jump[0], which is otherwise unused by the jump logic.
1947 We use this when dumping a trie and during optimisation. */
1948 if (trie->jump)
7f69552c 1949 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1950
786e8c11
YO
1951 /* XXXX */
1952 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1953 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1954 {
1955 OP( convert ) = TRIEC;
1956 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1957 Safefree(trie->bitmap);
1958 trie->bitmap= NULL;
1959 } else
1960 OP( convert ) = TRIE;
a3621e74 1961
3dab1dad
YO
1962 /* store the type in the flags */
1963 convert->flags = nodetype;
a5ca303d
YO
1964 DEBUG_r({
1965 optimize = convert
1966 + NODE_STEP_REGNODE
1967 + regarglen[ OP( convert ) ];
1968 });
1969 /* XXX We really should free up the resource in trie now,
1970 as we won't use them - (which resources?) dmq */
3dab1dad 1971 }
a3621e74 1972 /* needed for dumping*/
e62cc96a 1973 DEBUG_r(if (optimize) {
07be1b83 1974 regnode *opt = convert;
e62cc96a 1975 while ( ++opt < optimize) {
07be1b83
YO
1976 Set_Node_Offset_Length(opt,0,0);
1977 }
786e8c11
YO
1978 /*
1979 Try to clean up some of the debris left after the
1980 optimisation.
a3621e74 1981 */
786e8c11 1982 while( optimize < jumper ) {
07be1b83 1983 mjd_nodelen += Node_Length((optimize));
a3621e74 1984 OP( optimize ) = OPTIMIZED;
07be1b83 1985 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1986 optimize++;
1987 }
07be1b83 1988 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1989 });
1990 } /* end node insert */
07be1b83 1991#ifndef DEBUGGING
6e8b4190 1992 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1993#endif
786e8c11
YO
1994 return trie->jump
1995 ? MADE_JUMP_TRIE
1996 : trie->startstate>1
1997 ? MADE_EXACT_TRIE
1998 : MADE_TRIE;
1999}
2000
2001STATIC void
2002S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2003{
2004/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2005
2006 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2007 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2008 ISBN 0-201-10088-6
2009
2010 We find the fail state for each state in the trie, this state is the longest proper
2011 suffix of the current states 'word' that is also a proper prefix of another word in our
2012 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2013 the DFA not to have to restart after its tried and failed a word at a given point, it
2014 simply continues as though it had been matching the other word in the first place.
2015 Consider
2016 'abcdgu'=~/abcdefg|cdgu/
2017 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2018 fail, which would bring use to the state representing 'd' in the second word where we would
2019 try 'g' and succeed, prodceding to match 'cdgu'.
2020 */
2021 /* add a fail transition */
2022 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2023 U32 *q;
2024 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2025 const U32 numstates = trie->statecount;
786e8c11
YO
2026 const U32 ubound = trie->lasttrans + ucharcount;
2027 U32 q_read = 0;
2028 U32 q_write = 0;
2029 U32 charid;
2030 U32 base = trie->states[ 1 ].trans.base;
2031 U32 *fail;
2032 reg_ac_data *aho;
2033 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2034 GET_RE_DEBUG_FLAGS_DECL;
2035#ifndef DEBUGGING
2036 PERL_UNUSED_ARG(depth);
2037#endif
2038
2039
2040 ARG_SET( stclass, data_slot );
2041 Newxz( aho, 1, reg_ac_data );
2042 RExC_rx->data->data[ data_slot ] = (void*)aho;
2043 aho->trie=trie;
2044 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2045 numstates * sizeof(reg_trie_state));
786e8c11
YO
2046 Newxz( q, numstates, U32);
2047 Newxz( aho->fail, numstates, U32 );
2048 aho->refcount = 1;
2049 fail = aho->fail;
2050 /* initialize fail[0..1] to be 1 so that we always have
2051 a valid final fail state */
2052 fail[ 0 ] = fail[ 1 ] = 1;
2053
2054 for ( charid = 0; charid < ucharcount ; charid++ ) {
2055 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2056 if ( newstate ) {
2057 q[ q_write ] = newstate;
2058 /* set to point at the root */
2059 fail[ q[ q_write++ ] ]=1;
2060 }
2061 }
2062 while ( q_read < q_write) {
2063 const U32 cur = q[ q_read++ % numstates ];
2064 base = trie->states[ cur ].trans.base;
2065
2066 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2067 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2068 if (ch_state) {
2069 U32 fail_state = cur;
2070 U32 fail_base;
2071 do {
2072 fail_state = fail[ fail_state ];
2073 fail_base = aho->states[ fail_state ].trans.base;
2074 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2075
2076 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2077 fail[ ch_state ] = fail_state;
2078 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2079 {
2080 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2081 }
2082 q[ q_write++ % numstates] = ch_state;
2083 }
2084 }
2085 }
2086 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2087 when we fail in state 1, this allows us to use the
2088 charclass scan to find a valid start char. This is based on the principle
2089 that theres a good chance the string being searched contains lots of stuff
2090 that cant be a start char.
2091 */
2092 fail[ 0 ] = fail[ 1 ] = 0;
2093 DEBUG_TRIE_COMPILE_r({
1e2e3d02
YO
2094 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable (%"UVuf" states): 0",
2095 (int)(depth * 2), "", numstates
2096 );
786e8c11
YO
2097 for( q_read=1; q_read<numstates; q_read++ ) {
2098 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2099 }
2100 PerlIO_printf(Perl_debug_log, "\n");
2101 });
2102 Safefree(q);
2103 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2104}
2105
786e8c11 2106
a3621e74 2107/*
5d1c421c
JH
2108 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2109 * These need to be revisited when a newer toolchain becomes available.
2110 */
2111#if defined(__sparc64__) && defined(__GNUC__)
2112# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2113# undef SPARC64_GCC_WORKAROUND
2114# define SPARC64_GCC_WORKAROUND 1
2115# endif
2116#endif
2117
07be1b83
YO
2118#define DEBUG_PEEP(str,scan,depth) \
2119 DEBUG_OPTIMISE_r({ \
2120 SV * const mysv=sv_newmortal(); \
2121 regnode *Next = regnext(scan); \
2122 regprop(RExC_rx, mysv, scan); \
7f69552c 2123 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2124 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2125 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2126 });
2127
1de06328
YO
2128
2129
2130
2131
07be1b83
YO
2132#define JOIN_EXACT(scan,min,flags) \
2133 if (PL_regkind[OP(scan)] == EXACT) \
2134 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2135
be8e71aa 2136STATIC U32
07be1b83
YO
2137S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2138 /* Merge several consecutive EXACTish nodes into one. */
2139 regnode *n = regnext(scan);
2140 U32 stringok = 1;
2141 regnode *next = scan + NODE_SZ_STR(scan);
2142 U32 merged = 0;
2143 U32 stopnow = 0;
2144#ifdef DEBUGGING
2145 regnode *stop = scan;
72f13be8 2146 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2147#else
d47053eb
RGS
2148 PERL_UNUSED_ARG(depth);
2149#endif
2150#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2151 PERL_UNUSED_ARG(flags);
2152 PERL_UNUSED_ARG(val);
07be1b83 2153#endif
07be1b83
YO
2154 DEBUG_PEEP("join",scan,depth);
2155
2156 /* Skip NOTHING, merge EXACT*. */
2157 while (n &&
2158 ( PL_regkind[OP(n)] == NOTHING ||
2159 (stringok && (OP(n) == OP(scan))))
2160 && NEXT_OFF(n)
2161 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2162
2163 if (OP(n) == TAIL || n > next)
2164 stringok = 0;
2165 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2166 DEBUG_PEEP("skip:",n,depth);
2167 NEXT_OFF(scan) += NEXT_OFF(n);
2168 next = n + NODE_STEP_REGNODE;
2169#ifdef DEBUGGING
2170 if (stringok)
2171 stop = n;
2172#endif
2173 n = regnext(n);
2174 }
2175 else if (stringok) {
786e8c11 2176 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2177 regnode * const nnext = regnext(n);
2178
2179 DEBUG_PEEP("merg",n,depth);
2180
2181 merged++;
2182 if (oldl + STR_LEN(n) > U8_MAX)
2183 break;
2184 NEXT_OFF(scan) += NEXT_OFF(n);
2185 STR_LEN(scan) += STR_LEN(n);
2186 next = n + NODE_SZ_STR(n);
2187 /* Now we can overwrite *n : */
2188 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2189#ifdef DEBUGGING
2190 stop = next - 1;
2191#endif
2192 n = nnext;
2193 if (stopnow) break;
2194 }
2195
d47053eb
RGS
2196#ifdef EXPERIMENTAL_INPLACESCAN
2197 if (flags && !NEXT_OFF(n)) {
2198 DEBUG_PEEP("atch", val, depth);
2199 if (reg_off_by_arg[OP(n)]) {
2200 ARG_SET(n, val - n);
2201 }
2202 else {
2203 NEXT_OFF(n) = val - n;
2204 }
2205 stopnow = 1;
2206 }
07be1b83
YO
2207#endif
2208 }
2209
2210 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2211 /*
2212 Two problematic code points in Unicode casefolding of EXACT nodes:
2213
2214 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2215 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2216
2217 which casefold to
2218
2219 Unicode UTF-8
2220
2221 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2222 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2223
2224 This means that in case-insensitive matching (or "loose matching",
2225 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2226 length of the above casefolded versions) can match a target string
2227 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2228 This would rather mess up the minimum length computation.
2229
2230 What we'll do is to look for the tail four bytes, and then peek
2231 at the preceding two bytes to see whether we need to decrease
2232 the minimum length by four (six minus two).
2233
2234 Thanks to the design of UTF-8, there cannot be false matches:
2235 A sequence of valid UTF-8 bytes cannot be a subsequence of
2236 another valid sequence of UTF-8 bytes.
2237
2238 */
2239 char * const s0 = STRING(scan), *s, *t;
2240 char * const s1 = s0 + STR_LEN(scan) - 1;
2241 char * const s2 = s1 - 4;
e294cc5d
JH
2242#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2243 const char t0[] = "\xaf\x49\xaf\x42";
2244#else
07be1b83 2245 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2246#endif
07be1b83
YO
2247 const char * const t1 = t0 + 3;
2248
2249 for (s = s0 + 2;
2250 s < s2 && (t = ninstr(s, s1, t0, t1));
2251 s = t + 4) {
e294cc5d
JH
2252#ifdef EBCDIC
2253 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2254 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2255#else
07be1b83
YO
2256 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2257 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2258#endif
07be1b83
YO
2259 *min -= 4;
2260 }
2261 }
2262
2263#ifdef DEBUGGING
2264 /* Allow dumping */
2265 n = scan + NODE_SZ_STR(scan);
2266 while (n <= stop) {
2267 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2268 OP(n) = OPTIMIZED;
2269 NEXT_OFF(n) = 0;
2270 }
2271 n++;
2272 }
2273#endif
2274 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2275 return stopnow;
2276}
2277
653099ff
GS
2278/* REx optimizer. Converts nodes into quickier variants "in place".
2279 Finds fixed substrings. */
2280
a0288114 2281/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2282 to the position after last scanned or to NULL. */
2283
40d049e4
YO
2284#define INIT_AND_WITHP \
2285 assert(!and_withp); \
2286 Newx(and_withp,1,struct regnode_charclass_class); \
2287 SAVEFREEPV(and_withp)
07be1b83 2288
76e3520e 2289STATIC I32
40d049e4 2290S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2291 I32 *minlenp, I32 *deltap,
40d049e4
YO
2292 regnode *last,
2293 scan_data_t *data,
2294 I32 stopparen,
2295 U8* recursed,
2296 struct regnode_charclass_class *and_withp,
2297 U32 flags, U32 depth)
c277df42
IZ
2298 /* scanp: Start here (read-write). */
2299 /* deltap: Write maxlen-minlen here. */
2300 /* last: Stop before this one. */
40d049e4
YO
2301 /* data: string data about the pattern */
2302 /* stopparen: treat close N as END */
2303 /* recursed: which subroutines have we recursed into */
2304 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2305{
97aff369 2306 dVAR;
c277df42
IZ
2307 I32 min = 0, pars = 0, code;
2308 regnode *scan = *scanp, *next;
2309 I32 delta = 0;
2310 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2311 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2312 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2313 scan_data_t data_fake;
a3621e74 2314 SV *re_trie_maxbuff = NULL;
786e8c11 2315 regnode *first_non_open = scan;
e2e6a0f1 2316 I32 stopmin = I32_MAX;
a3621e74 2317 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2318#ifdef DEBUGGING
40d049e4 2319 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2320#endif
40d049e4 2321
786e8c11 2322 if ( depth == 0 ) {
40d049e4 2323 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2324 first_non_open=regnext(first_non_open);
2325 }
2326
b81d288d 2327
c277df42
IZ
2328 while (scan && OP(scan) != END && scan < last) {
2329 /* Peephole optimizer: */
1de06328 2330 DEBUG_STUDYDATA(data,depth);
07be1b83 2331 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2332 JOIN_EXACT(scan,&min,0);
a3621e74 2333
653099ff
GS
2334 /* Follow the next-chain of the current node and optimize
2335 away all the NOTHINGs from it. */
c277df42 2336 if (OP(scan) != CURLYX) {
a3b680e6 2337 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2338 ? I32_MAX
2339 /* I32 may be smaller than U16 on CRAYs! */
2340 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2341 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2342 int noff;
2343 regnode *n = scan;
b81d288d 2344
c277df42
IZ
2345 /* Skip NOTHING and LONGJMP. */
2346 while ((n = regnext(n))
3dab1dad 2347 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2348 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2349 && off + noff < max)
2350 off += noff;
2351 if (reg_off_by_arg[OP(scan)])
2352 ARG(scan) = off;
b81d288d 2353 else
c277df42
IZ
2354 NEXT_OFF(scan) = off;
2355 }
a3621e74 2356
07be1b83 2357
3dab1dad 2358
653099ff
GS
2359 /* The principal pseudo-switch. Cannot be a switch, since we
2360 look into several different things. */
b81d288d 2361 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2362 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2363 next = regnext(scan);
2364 code = OP(scan);
a3621e74 2365 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2366
2367 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2368 /* NOTE - There is similar code to this block below for handling
2369 TRIE nodes on a re-study. If you change stuff here check there
2370 too. */
c277df42 2371 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2372 struct regnode_charclass_class accum;
d4c19fe8 2373 regnode * const startbranch=scan;
c277df42 2374
653099ff 2375 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2376 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2377 if (flags & SCF_DO_STCLASS)
830247a4 2378 cl_init_zero(pRExC_state, &accum);
a3621e74 2379
c277df42 2380 while (OP(scan) == code) {
830247a4 2381 I32 deltanext, minnext, f = 0, fake;
653099ff 2382 struct regnode_charclass_class this_class;
c277df42
IZ
2383
2384 num++;
2385 data_fake.flags = 0;
40d049e4 2386 if (data) {
2c2d71f5 2387 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2388 data_fake.last_closep = data->last_closep;
2389 }
2390 else
2391 data_fake.last_closep = &fake;
c277df42
IZ
2392 next = regnext(scan);
2393 scan = NEXTOPER(scan);
2394 if (code != BRANCH)
2395 scan = NEXTOPER(scan);
653099ff 2396 if (flags & SCF_DO_STCLASS) {
830247a4 2397 cl_init(pRExC_state, &this_class);
653099ff
GS
2398 data_fake.start_class = &this_class;
2399 f = SCF_DO_STCLASS_AND;
b81d288d 2400 }
e1901655
IZ
2401 if (flags & SCF_WHILEM_VISITED_POS)
2402 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2403
653099ff 2404 /* we suppose the run is continuous, last=next...*/
1de06328 2405 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2406 next, &data_fake,
2407 stopparen, recursed, NULL, f,depth+1);
b81d288d 2408 if (min1 > minnext)
c277df42
IZ
2409 min1 = minnext;
2410 if (max1 < minnext + deltanext)
2411 max1 = minnext + deltanext;
2412 if (deltanext == I32_MAX)
aca2d497 2413 is_inf = is_inf_internal = 1;
c277df42
IZ
2414 scan = next;
2415 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2416 pars++;
e2e6a0f1
YO
2417 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2418 if ( stopmin > minnext)
2419 stopmin = min + min1;
2420 flags &= ~SCF_DO_SUBSTR;
2421 if (data)
2422 data->flags |= SCF_SEEN_ACCEPT;
2423 }
3dab1dad
YO
2424 if (data) {
2425 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2426 data->flags |= SF_HAS_EVAL;
2c2d71f5 2427 data->whilem_c = data_fake.whilem_c;
3dab1dad 2428 }
653099ff 2429 if (flags & SCF_DO_STCLASS)
830247a4 2430 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2431 if (code == SUSPEND)
c277df42
IZ
2432 break;
2433 }
2434 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2435 min1 = 0;
2436 if (flags & SCF_DO_SUBSTR) {
2437 data->pos_min += min1;
2438 data->pos_delta += max1 - min1;
2439 if (max1 != min1 || is_inf)
2440 data->longest = &(data->longest_float);
2441 }
2442 min += min1;
2443 delta += max1 - min1;
653099ff 2444 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2445 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2446 if (min1) {
40d049e4 2447 cl_and(data->start_class, and_withp);
653099ff
GS
2448 flags &= ~SCF_DO_STCLASS;
2449 }
2450 }
2451 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2452 if (min1) {
2453 cl_and(data->start_class, &accum);
653099ff 2454 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2455 }
2456 else {
b81d288d 2457 /* Switch to OR mode: cache the old value of
de0c8cb8 2458 * data->start_class */
40d049e4
YO
2459 INIT_AND_WITHP;
2460 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2461 struct regnode_charclass_class);
2462 flags &= ~SCF_DO_STCLASS_AND;
2463 StructCopy(&accum, data->start_class,
2464 struct regnode_charclass_class);
2465 flags |= SCF_DO_STCLASS_OR;
2466 data->start_class->flags |= ANYOF_EOS;
2467 }
653099ff 2468 }
a3621e74 2469
786e8c11 2470 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2471 /* demq.
2472
2473 Assuming this was/is a branch we are dealing with: 'scan' now
2474 points at the item that follows the branch sequence, whatever
2475 it is. We now start at the beginning of the sequence and look
2476 for subsequences of
2477
786e8c11
YO
2478 BRANCH->EXACT=>x1
2479 BRANCH->EXACT=>x2
2480 tail
a3621e74
YO
2481
2482 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2483
2484 If we can find such a subseqence we need to turn the first
2485 element into a trie and then add the subsequent branch exact
2486 strings to the trie.
2487
2488 We have two cases
2489
786e8c11 2490 1. patterns where the whole set of branch can be converted.
a3621e74 2491
786e8c11 2492 2. patterns where only a subset can be converted.
a3621e74
YO
2493
2494 In case 1 we can replace the whole set with a single regop
2495 for the trie. In case 2 we need to keep the start and end
2496 branchs so
2497
2498 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2499 becomes BRANCH TRIE; BRANCH X;
2500
786e8c11
YO
2501 There is an additional case, that being where there is a
2502 common prefix, which gets split out into an EXACT like node
2503 preceding the TRIE node.
2504
2505 If x(1..n)==tail then we can do a simple trie, if not we make
2506 a "jump" trie, such that when we match the appropriate word
2507 we "jump" to the appopriate tail node. Essentailly we turn
2508 a nested if into a case structure of sorts.
a3621e74
YO
2509
2510 */
786e8c11 2511
3dab1dad 2512 int made=0;
0111c4fd
RGS
2513 if (!re_trie_maxbuff) {
2514 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2515 if (!SvIOK(re_trie_maxbuff))
2516 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2517 }
786e8c11 2518 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2519 regnode *cur;
2520 regnode *first = (regnode *)NULL;
2521 regnode *last = (regnode *)NULL;
2522 regnode *tail = scan;
2523 U8 optype = 0;
2524 U32 count=0;
2525
2526#ifdef DEBUGGING
c445ea15 2527 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2528#endif
2529 /* var tail is used because there may be a TAIL
2530 regop in the way. Ie, the exacts will point to the
2531 thing following the TAIL, but the last branch will
2532 point at the TAIL. So we advance tail. If we
2533 have nested (?:) we may have to move through several
2534 tails.
2535 */
2536
2537 while ( OP( tail ) == TAIL ) {
2538 /* this is the TAIL generated by (?:) */
2539 tail = regnext( tail );
2540 }
2541
3dab1dad 2542
a3621e74 2543 DEBUG_OPTIMISE_r({
32fc9b6a 2544 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2545 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2546 (int)depth * 2 + 2, "",
2547 "Looking for TRIE'able sequences. Tail node is: ",
2548 SvPV_nolen_const( mysv )
a3621e74
YO
2549 );
2550 });
3dab1dad 2551
a3621e74
YO
2552 /*
2553
2554 step through the branches, cur represents each
2555 branch, noper is the first thing to be matched
2556 as part of that branch and noper_next is the
2557 regnext() of that node. if noper is an EXACT
2558 and noper_next is the same as scan (our current
2559 position in the regex) then the EXACT branch is
2560 a possible optimization target. Once we have
2561 two or more consequetive such branches we can
2562 create a trie of the EXACT's contents and stich
2563 it in place. If the sequence represents all of
2564 the branches we eliminate the whole thing and
2565 replace it with a single TRIE. If it is a
2566 subsequence then we need to stitch it in. This
2567 means the first branch has to remain, and needs
2568 to be repointed at the item on the branch chain
2569 following the last branch optimized. This could
2570 be either a BRANCH, in which case the
2571 subsequence is internal, or it could be the
2572 item following the branch sequence in which
2573 case the subsequence is at the end.
2574
2575 */
2576
2577 /* dont use tail as the end marker for this traverse */
2578 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2579 regnode * const noper = NEXTOPER( cur );
be981c67 2580#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2581 regnode * const noper_next = regnext( noper );
be981c67 2582#endif
a3621e74 2583
a3621e74 2584 DEBUG_OPTIMISE_r({
32fc9b6a 2585 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2586 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2587 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2588
32fc9b6a 2589 regprop(RExC_rx, mysv, noper);
a3621e74 2590 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2591 SvPV_nolen_const(mysv));
a3621e74
YO
2592
2593 if ( noper_next ) {
32fc9b6a 2594 regprop(RExC_rx, mysv, noper_next );
a3621e74 2595 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2596 SvPV_nolen_const(mysv));
a3621e74 2597 }
3dab1dad
YO
2598 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2599 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2600 });
3dab1dad
YO
2601 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2602 : PL_regkind[ OP( noper ) ] == EXACT )
2603 || OP(noper) == NOTHING )
786e8c11
YO
2604#ifdef NOJUMPTRIE
2605 && noper_next == tail
2606#endif
2607 && count < U16_MAX)
a3621e74
YO
2608 {
2609 count++;
3dab1dad
YO
2610 if ( !first || optype == NOTHING ) {
2611 if (!first) first = cur;
a3621e74
YO
2612 optype = OP( noper );
2613 } else {
a3621e74 2614 last = cur;
a3621e74
YO
2615 }
2616 } else {
2617 if ( last ) {
786e8c11
YO
2618 make_trie( pRExC_state,
2619 startbranch, first, cur, tail, count,
2620 optype, depth+1 );
a3621e74 2621 }
3dab1dad 2622 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2623#ifdef NOJUMPTRIE
2624 && noper_next == tail
2625#endif
2626 ){
a3621e74
YO
2627 count = 1;
2628 first = cur;
2629 optype = OP( noper );
2630 } else {
2631 count = 0;
2632 first = NULL;
2633 optype = 0;
2634 }
2635 last = NULL;
2636 }
2637 }
2638 DEBUG_OPTIMISE_r({
32fc9b6a 2639 regprop(RExC_rx, mysv, cur);
a3621e74 2640 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2641 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2642 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2643
2644 });
2645 if ( last ) {
786e8c11 2646 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2647#ifdef TRIE_STUDY_OPT
786e8c11
YO
2648 if ( ((made == MADE_EXACT_TRIE &&
2649 startbranch == first)
2650 || ( first_non_open == first )) &&
2651 depth==0 )
2652 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2653#endif
07be1b83 2654 }
a3621e74 2655 }
3dab1dad
YO
2656
2657 } /* do trie */
786e8c11 2658
a0ed51b3 2659 }
a3621e74 2660 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2661 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2662 } else /* single branch is optimized. */
c277df42
IZ
2663 scan = NEXTOPER(scan);
2664 continue;
a0ed51b3
LW
2665 }
2666 else if (OP(scan) == EXACT) {
cd439c50 2667 I32 l = STR_LEN(scan);
c445ea15 2668 UV uc;
a0ed51b3 2669 if (UTF) {
a3b680e6 2670 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2671 l = utf8_length(s, s + l);
9041c2e3 2672 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2673 } else {
2674 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2675 }
2676 min += l;
c277df42 2677 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2678 /* The code below prefers earlier match for fixed
2679 offset, later match for variable offset. */
2680 if (data->last_end == -1) { /* Update the start info. */
2681 data->last_start_min = data->pos_min;
2682 data->last_start_max = is_inf
b81d288d 2683 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2684 }
cd439c50 2685 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2686 if (UTF)
2687 SvUTF8_on(data->last_found);
0eda9292 2688 {
9a957fbc 2689 SV * const sv = data->last_found;
a28509cc 2690 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2691 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2692 if (mg && mg->mg_len >= 0)
5e43f467
JH
2693 mg->mg_len += utf8_length((U8*)STRING(scan),
2694 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2695 }
c277df42
IZ
2696 data->last_end = data->pos_min + l;
2697 data->pos_min += l; /* As in the first entry. */
2698 data->flags &= ~SF_BEFORE_EOL;
2699 }
653099ff
GS
2700 if (flags & SCF_DO_STCLASS_AND) {
2701 /* Check whether it is compatible with what we know already! */
2702 int compat = 1;
2703
1aa99e6b 2704 if (uc >= 0x100 ||
516a5887 2705 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2706 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2707 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2708 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2709 )
653099ff
GS
2710 compat = 0;
2711 ANYOF_CLASS_ZERO(data->start_class);
2712 ANYOF_BITMAP_ZERO(data->start_class);
2713 if (compat)
1aa99e6b 2714 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2715 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2716 if (uc < 0x100)
2717 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2718 }
2719 else if (flags & SCF_DO_STCLASS_OR) {
2720 /* false positive possible if the class is case-folded */
1aa99e6b 2721 if (uc < 0x100)
9b877dbb
IH
2722 ANYOF_BITMAP_SET(data->start_class, uc);
2723 else
2724 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2725 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2726 cl_and(data->start_class, and_withp);
653099ff
GS
2727 }
2728 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2729 }
3dab1dad 2730 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2731 I32 l = STR_LEN(scan);
1aa99e6b 2732 UV uc = *((U8*)STRING(scan));
653099ff
GS
2733
2734 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2735 if (flags & SCF_DO_SUBSTR) {
2736 assert(data);
1de06328 2737 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2738 }
a0ed51b3 2739 if (UTF) {
6136c704 2740 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2741 l = utf8_length(s, s + l);
9041c2e3 2742 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2743 }
2744 min += l;
ecaa9b9c 2745 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2746 data->pos_min += l;
653099ff
GS
2747 if (flags & SCF_DO_STCLASS_AND) {
2748 /* Check whether it is compatible with what we know already! */
2749 int compat = 1;
2750
1aa99e6b 2751 if (uc >= 0x100 ||
516a5887 2752 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2753 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2754 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2755 compat = 0;
2756 ANYOF_CLASS_ZERO(data->start_class);
2757 ANYOF_BITMAP_ZERO(data->start_class);
2758 if (compat) {
1aa99e6b 2759 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2760 data->start_class->flags &= ~ANYOF_EOS;
2761 data->start_class->flags |= ANYOF_FOLD;
2762 if (OP(scan) == EXACTFL)
2763 data->start_class->flags |= ANYOF_LOCALE;
2764 }
2765 }
2766 else if (flags & SCF_DO_STCLASS_OR) {
2767 if (data->start_class->flags & ANYOF_FOLD) {
2768 /* false positive possible if the class is case-folded.
2769 Assume that the locale settings are the same... */
1aa99e6b
IH
2770 if (uc < 0x100)
2771 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2772 data->start_class->flags &= ~ANYOF_EOS;
2773 }
40d049e4 2774 cl_and(data->start_class, and_withp);
653099ff
GS
2775 }
2776 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2777 }
bfed75c6 2778 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2779 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2780 I32 f = flags, pos_before = 0;
d4c19fe8 2781 regnode * const oscan = scan;
653099ff
GS
2782 struct regnode_charclass_class this_class;
2783 struct regnode_charclass_class *oclass = NULL;
727f22e3 2784 I32 next_is_eval = 0;
653099ff 2785
3dab1dad 2786 switch (PL_regkind[OP(scan)]) {
653099ff 2787 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2788 scan = NEXTOPER(scan);
2789 goto finish;
2790 case PLUS:
653099ff 2791 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2792 next = NEXTOPER(scan);
653099ff 2793 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2794 mincount = 1;
2795 maxcount = REG_INFTY;
c277df42
IZ
2796 next = regnext(scan);
2797 scan = NEXTOPER(scan);
2798 goto do_curly;
2799 }
2800 }
2801 if (flags & SCF_DO_SUBSTR)
2802 data->pos_min++;
2803 min++;
2804 /* Fall through. */
2805 case STAR:
653099ff
GS
2806 if (flags & SCF_DO_STCLASS) {
2807 mincount = 0;
b81d288d 2808 maxcount = REG_INFTY;
653099ff
GS
2809 next = regnext(scan);
2810 scan = NEXTOPER(scan);
2811 goto do_curly;
2812 }
b81d288d 2813 is_inf = is_inf_internal = 1;
c277df42
IZ
2814 scan = regnext(scan);
2815 if (flags & SCF_DO_SUBSTR) {
1de06328 2816 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2817 data->longest = &(data->longest_float);
2818 }
2819 goto optimize_curly_tail;
2820 case CURLY:
40d049e4
YO
2821 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2822 && (scan->flags == stopparen))
2823 {
2824 mincount = 1;
2825 maxcount = 1;
2826 } else {
2827 mincount = ARG1(scan);
2828 maxcount = ARG2(scan);
2829 }
c277df42 2830 next = regnext(scan);
cb434fcc
IZ
2831 if (OP(scan) == CURLYX) {
2832 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2833 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2834 }
c277df42 2835 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2836 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2837 do_curly:
2838 if (flags & SCF_DO_SUBSTR) {
1de06328 2839 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2840 pos_before = data->pos_min;
2841 }
2842 if (data) {
2843 fl = data->flags;
2844 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2845 if (is_inf)
2846 data->flags |= SF_IS_INF;
2847 }
653099ff 2848 if (flags & SCF_DO_STCLASS) {
830247a4 2849 cl_init(pRExC_state, &this_class);
653099ff
GS
2850 oclass = data->start_class;
2851 data->start_class = &this_class;
2852 f |= SCF_DO_STCLASS_AND;
2853 f &= ~SCF_DO_STCLASS_OR;
2854 }
e1901655
IZ
2855 /* These are the cases when once a subexpression
2856 fails at a particular position, it cannot succeed
2857 even after backtracking at the enclosing scope.
b81d288d 2858
e1901655
IZ
2859 XXXX what if minimal match and we are at the
2860 initial run of {n,m}? */
2861 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2862 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2863
c277df42 2864 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2865 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2866 last, data, stopparen, recursed, NULL,
a3621e74
YO
2867 (mincount == 0
2868 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2869
2870 if (flags & SCF_DO_STCLASS)
2871 data->start_class = oclass;
2872 if (mincount == 0 || minnext == 0) {
2873 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2874 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2875 }
2876 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2877 /* Switch to OR mode: cache the old value of
653099ff 2878 * data->start_class */
40d049e4
YO
2879 INIT_AND_WITHP;
2880 StructCopy(data->start_class, and_withp,
653099ff
GS
2881 struct regnode_charclass_class);
2882 flags &= ~SCF_DO_STCLASS_AND;
2883 StructCopy(&this_class, data->start_class,
2884 struct regnode_charclass_class);
2885 flags |= SCF_DO_STCLASS_OR;
2886 data->start_class->flags |= ANYOF_EOS;
2887 }
2888 } else { /* Non-zero len */
2889 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2890 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2891 cl_and(data->start_class, and_withp);
653099ff
GS
2892 }
2893 else if (flags & SCF_DO_STCLASS_AND)
2894 cl_and(data->start_class, &this_class);
2895 flags &= ~SCF_DO_STCLASS;
2896 }
c277df42
IZ
2897 if (!scan) /* It was not CURLYX, but CURLY. */
2898 scan = next;
041457d9
DM
2899 if ( /* ? quantifier ok, except for (?{ ... }) */
2900 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2901 && (minnext == 0) && (deltanext == 0)
99799961 2902 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2903 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2904 && ckWARN(WARN_REGEXP))
b45f050a 2905 {
830247a4 2906 vWARN(RExC_parse,
b45f050a
JF
2907 "Quantifier unexpected on zero-length expression");
2908 }
2909
c277df42 2910 min += minnext * mincount;
b81d288d 2911 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2912 && (minnext + deltanext) > 0)
2913 || deltanext == I32_MAX);
aca2d497 2914 is_inf |= is_inf_internal;
c277df42
IZ
2915 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2916
2917 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2918 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2919 && data->flags & SF_IN_PAR
2920 && !(data->flags & SF_HAS_EVAL)
2921 && !deltanext && minnext == 1 ) {
2922 /* Try to optimize to CURLYN. */
2923 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2924 regnode * const nxt1 = nxt;
497b47a8
JH
2925#ifdef DEBUGGING
2926 regnode *nxt2;
2927#endif
c277df42
IZ
2928
2929 /* Skip open. */
2930 nxt = regnext(nxt);
bfed75c6 2931 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2932 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2933 && STR_LEN(nxt) == 1))
c277df42 2934 goto nogo;
497b47a8 2935#ifdef DEBUGGING
c277df42 2936 nxt2 = nxt;
497b47a8 2937#endif
c277df42 2938 nxt = regnext(nxt);
b81d288d 2939 if (OP(nxt) != CLOSE)
c277df42 2940 goto nogo;
40d049e4
YO
2941 if (RExC_open_parens) {
2942 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2943 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2944 }
c277df42 2945 /* Now we know that nxt2 is the only contents: */
eb160463 2946 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2947 OP(oscan) = CURLYN;
2948 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2949
c277df42
IZ
2950#ifdef DEBUGGING
2951 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2952 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2953 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2954 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2955 OP(nxt + 1) = OPTIMIZED; /* was count. */
2956 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2957#endif
c277df42 2958 }
c277df42
IZ
2959 nogo:
2960
2961 /* Try optimization CURLYX => CURLYM. */
b81d288d 2962 if ( OP(oscan) == CURLYX && data
c277df42 2963 && !(data->flags & SF_HAS_PAR)
c277df42 2964 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2965 && !deltanext /* atom is fixed width */
2966 && minnext != 0 /* CURLYM can't handle zero width */
2967 ) {
c277df42
IZ
2968 /* XXXX How to optimize if data == 0? */
2969 /* Optimize to a simpler form. */
2970 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2971 regnode *nxt2;
2972
2973 OP(oscan) = CURLYM;
2974 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2975 && (OP(nxt2) != WHILEM))
c277df42
IZ
2976 nxt = nxt2;
2977 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2978 /* Need to optimize away parenths. */
2979 if (data->flags & SF_IN_PAR) {
2980 /* Set the parenth number. */
2981 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2982
b81d288d 2983 if (OP(nxt) != CLOSE)
b45f050a 2984 FAIL("Panic opt close");
eb160463 2985 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2986 if (RExC_open_parens) {
2987 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2988 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2989 }
c277df42
IZ
2990 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2991 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 2992
c277df42
IZ
2993#ifdef DEBUGGING
2994 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2995 OP(nxt + 1) = OPTIMIZED; /* was count. */
2996 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2997 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2998#endif
c277df42
IZ
2999#if 0
3000 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3001 regnode *nnxt = regnext(nxt1);
b81d288d 3002
c277df42
IZ
3003 if (nnxt == nxt) {
3004 if (reg_off_by_arg[OP(nxt1)])
3005 ARG_SET(nxt1, nxt2 - nxt1);
3006 else if (nxt2 - nxt1 < U16_MAX)
3007 NEXT_OFF(nxt1) = nxt2 - nxt1;
3008 else
3009 OP(nxt) = NOTHING; /* Cannot beautify */
3010 }
3011 nxt1 = nnxt;
3012 }
3013#endif
3014 /* Optimize again: */
1de06328 3015 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3016 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3017 }
3018 else
c277df42 3019 oscan->flags = 0;
c277df42 3020 }
e1901655
IZ
3021 else if ((OP(oscan) == CURLYX)
3022 && (flags & SCF_WHILEM_VISITED_POS)
3023 /* See the comment on a similar expression above.
3024 However, this time it not a subexpression
3025 we care about, but the expression itself. */
3026 && (maxcount == REG_INFTY)
3027 && data && ++data->whilem_c < 16) {
3028 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3029 /* Find WHILEM (as in regexec.c) */
3030 regnode *nxt = oscan + NEXT_OFF(oscan);
3031
3032 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3033 nxt += ARG(nxt);
eb160463
GS
3034 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3035 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3036 }
b81d288d 3037 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3038 pars++;
3039 if (flags & SCF_DO_SUBSTR) {
c445ea15 3040 SV *last_str = NULL;
c277df42
IZ
3041 int counted = mincount != 0;
3042
3043 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3044#if defined(SPARC64_GCC_WORKAROUND)
3045 I32 b = 0;
3046 STRLEN l = 0;
cfd0369c 3047 const char *s = NULL;
5d1c421c
JH
3048 I32 old = 0;
3049
3050 if (pos_before >= data->last_start_min)
3051 b = pos_before;
3052 else
3053 b = data->last_start_min;
3054
3055 l = 0;
cfd0369c 3056 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3057 old = b - data->last_start_min;
3058
3059#else
b81d288d 3060 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3061 ? pos_before : data->last_start_min;
3062 STRLEN l;
d4c19fe8 3063 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3064 I32 old = b - data->last_start_min;
5d1c421c 3065#endif
a0ed51b3
LW
3066
3067 if (UTF)
3068 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3069
a0ed51b3 3070 l -= old;
c277df42 3071 /* Get the added string: */
79cb57f6 3072 last_str = newSVpvn(s + old, l);
0e933229
IH
3073 if (UTF)
3074 SvUTF8_on(last_str);
c277df42
IZ
3075 if (deltanext == 0 && pos_before == b) {
3076 /* What was added is a constant string */
3077 if (mincount > 1) {
3078 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3079 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3080 SvPVX_const(last_str), l, mincount - 1);
b162af07 3081 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3082 /* Add additional parts. */
b81d288d 3083 SvCUR_set(data->last_found,
c277df42
IZ
3084 SvCUR(data->last_found) - l);
3085 sv_catsv(data->last_found, last_str);
0eda9292
JH
3086 {
3087 SV * sv = data->last_found;
3088 MAGIC *mg =
3089 SvUTF8(sv) && SvMAGICAL(sv) ?
3090 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3091 if (mg && mg->mg_len >= 0)
3092 mg->mg_len += CHR_SVLEN(last_str);
3093 }
c277df42
IZ
3094 data->last_end += l * (mincount - 1);
3095 }
2a8d9689
HS
3096 } else {
3097 /* start offset must point into the last copy */
3098 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3099 data->last_start_max += is_inf ? I32_MAX
3100 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3101 }
3102 }
3103 /* It is counted once already... */
3104 data->pos_min += minnext * (mincount - counted);
3105 data->pos_delta += - counted * deltanext +
3106 (minnext + deltanext) * maxcount - minnext * mincount;
3107 if (mincount != maxcount) {
653099ff
GS
3108 /* Cannot extend fixed substrings found inside
3109 the group. */
1de06328 3110 scan_commit(pRExC_state,data,minlenp);
c277df42 3111 if (mincount && last_str) {
d4c19fe8
AL
3112 SV * const sv = data->last_found;
3113 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3114 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3115
3116 if (mg)
3117 mg->mg_len = -1;
3118 sv_setsv(sv, last_str);
c277df42 3119 data->last_end = data->pos_min;
b81d288d 3120 data->last_start_min =
a0ed51b3 3121 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3122 data->last_start_max = is_inf
3123 ? I32_MAX
c277df42 3124 : data->pos_min + data->pos_delta
a0ed51b3 3125 - CHR_SVLEN(last_str);
c277df42
IZ
3126 }
3127 data->longest = &(data->longest_float);
3128 }
aca2d497 3129 SvREFCNT_dec(last_str);
c277df42 3130 }
405ff068 3131 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3132 data->flags |= SF_HAS_EVAL;
3133 optimize_curly_tail:
c277df42 3134 if (OP(oscan) != CURLYX) {
3dab1dad 3135 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3136 && NEXT_OFF(next))
3137 NEXT_OFF(oscan) += NEXT_OFF(next);
3138 }
c277df42 3139 continue;
653099ff 3140 default: /* REF and CLUMP only? */
c277df42 3141 if (flags & SCF_DO_SUBSTR) {
1de06328 3142 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3143 data->longest = &(data->longest_float);
3144 }
aca2d497 3145 is_inf = is_inf_internal = 1;
653099ff 3146 if (flags & SCF_DO_STCLASS_OR)
830247a4 3147 cl_anything(pRExC_state, data->start_class);
653099ff 3148 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3149 break;
3150 }
a0ed51b3 3151 }
bfed75c6 3152 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3153 int value = 0;
653099ff 3154
c277df42 3155 if (flags & SCF_DO_SUBSTR) {
1de06328 3156 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3157 data->pos_min++;
3158 }
3159 min++;
653099ff
GS
3160 if (flags & SCF_DO_STCLASS) {
3161 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3162
3163 /* Some of the logic below assumes that switching
3164 locale on will only add false positives. */
3dab1dad 3165 switch (PL_regkind[OP(scan)]) {
653099ff 3166 case SANY:
653099ff
GS
3167 default:
3168 do_default:
3169 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3170 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3171 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3172 break;
3173 case REG_ANY:
3174 if (OP(scan) == SANY)
3175 goto do_default;
3176 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3177 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3178 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3179 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3180 }
3181 if (flags & SCF_DO_STCLASS_AND || !value)
3182 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3183 break;
3184 case ANYOF:
3185 if (flags & SCF_DO_STCLASS_AND)
3186 cl_and(data->start_class,
3187 (struct regnode_charclass_class*)scan);
3188 else
830247a4 3189 cl_or(pRExC_state, data->start_class,
653099ff
GS
3190 (struct regnode_charclass_class*)scan);
3191 break;
3192 case ALNUM:
3193 if (flags & SCF_DO_STCLASS_AND) {
3194 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3195 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3196 for (value = 0; value < 256; value++)
3197 if (!isALNUM(value))
3198 ANYOF_BITMAP_CLEAR(data->start_class, value);
3199 }
3200 }
3201 else {
3202 if (data->start_class->flags & ANYOF_LOCALE)
3203 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3204 else {
3205 for (value = 0; value < 256; value++)
3206 if (isALNUM(value))
b81d288d 3207 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3208 }
3209 }
3210 break;
3211 case ALNUML:
3212 if (flags & SCF_DO_STCLASS_AND) {
3213 if (data->start_class->flags & ANYOF_LOCALE)
3214 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3215 }
3216 else {
3217 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3218 data->start_class->flags |= ANYOF_LOCALE;
3219 }
3220 break;
3221 case NALNUM:
3222 if (flags & SCF_DO_STCLASS_AND) {
3223 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3224 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3225 for (value = 0; value < 256; value++)
3226 if (isALNUM(value))
3227 ANYOF_BITMAP_CLEAR(data->start_class, value);
3228 }
3229 }
3230 else {
3231 if (data->start_class->flags & ANYOF_LOCALE)
3232 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3233 else {
3234 for (value = 0; value < 256; value++)
3235 if (!isALNUM(value))
b81d288d 3236 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3237 }
3238 }
3239 break;
3240 case NALNUML:
3241 if (flags & SCF_DO_STCLASS_AND) {
3242 if (data->start_class->flags & ANYOF_LOCALE)
3243 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3244 }
3245 else {
3246 data->start_class->flags |= ANYOF_LOCALE;
3247 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3248 }
3249 break;
3250 case SPACE:
3251 if (flags & SCF_DO_STCLASS_AND) {
3252 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3253 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3254 for (value = 0; value < 256; value++)
3255 if (!isSPACE(value))
3256 ANYOF_BITMAP_CLEAR(data->start_class, value);
3257 }
3258 }
3259 else {
3260 if (data->start_class->flags & ANYOF_LOCALE)
3261 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3262 else {
3263 for (value = 0; value < 256; value++)
3264 if (isSPACE(value))
b81d288d 3265 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3266 }
3267 }
3268 break;
3269 case SPACEL:
3270 if (flags & SCF_DO_STCLASS_AND) {
3271 if (data->start_class->flags & ANYOF_LOCALE)
3272 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3273 }
3274 else {
3275 data->start_class->flags |= ANYOF_LOCALE;
3276 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3277 }
3278 break;
3279 case NSPACE:
3280 if (flags & SCF_DO_STCLASS_AND) {
3281 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3282 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3283 for (value = 0; value < 256; value++)
3284 if (isSPACE(value))
3285 ANYOF_BITMAP_CLEAR(data->start_class, value);
3286 }
3287 }
3288 else {
3289 if (data->start_class->flags & ANYOF_LOCALE)
3290 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3291 else {
3292 for (value = 0; value < 256; value++)
3293 if (!isSPACE(value))
b81d288d 3294 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3295 }
3296 }
3297 break;
3298 case NSPACEL:
3299 if (flags & SCF_DO_STCLASS_AND) {
3300 if (data->start_class->flags & ANYOF_LOCALE) {
3301 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3302 for (value = 0; value < 256; value++)
3303 if (!isSPACE(value))
3304 ANYOF_BITMAP_CLEAR(data->start_class, value);
3305 }
3306 }
3307 else {
3308 data->start_class->flags |= ANYOF_LOCALE;
3309 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3310 }
3311 break;
3312 case DIGIT:
3313 if (flags & SCF_DO_STCLASS_AND) {
3314 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3315 for (value = 0; value < 256; value++)
3316 if (!isDIGIT(value))
3317 ANYOF_BITMAP_CLEAR(data->start_class, value);
3318 }
3319 else {
3320 if (data->start_class->flags & ANYOF_LOCALE)
3321 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3322 else {
3323 for (value = 0; value < 256; value++)
3324 if (isDIGIT(value))
b81d288d 3325 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3326 }
3327 }
3328 break;
3329 case NDIGIT:
3330 if (flags & SCF_DO_STCLASS_AND) {
3331 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3332 for (value = 0; value < 256; value++)
3333 if (isDIGIT(value))
3334 ANYOF_BITMAP_CLEAR(data->start_class, value);
3335 }
3336 else {
3337 if (data->start_class->flags & ANYOF_LOCALE)
3338 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3339 else {
3340 for (value = 0; value < 256; value++)
3341 if (!isDIGIT(value))
b81d288d 3342 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3343 }
3344 }
3345 break;
3346 }
3347 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3348 cl_and(data->start_class, and_withp);
653099ff
GS
3349 flags &= ~SCF_DO_STCLASS;
3350 }
a0ed51b3 3351 }
3dab1dad 3352 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3353 data->flags |= (OP(scan) == MEOL
3354 ? SF_BEFORE_MEOL
3355 : SF_BEFORE_SEOL);
a0ed51b3 3356 }
3dab1dad 3357 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3358 /* Lookbehind, or need to calculate parens/evals/stclass: */
3359 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3360 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3361 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3362 || OP(scan) == UNLESSM )
3363 {
3364 /* Negative Lookahead/lookbehind
3365 In this case we can't do fixed string optimisation.
3366 */
3367
3368 I32 deltanext, minnext, fake = 0;
3369 regnode *nscan;
3370 struct regnode_charclass_class intrnl;
3371 int f = 0;
3372
3373 data_fake.flags = 0;
3374 if (data) {
3375 data_fake.whilem_c = data->whilem_c;
3376 data_fake.last_closep = data->last_closep;
a0ed51b3 3377 }
1de06328
YO
3378 else
3379 data_fake.last_closep = &fake;
3380 if ( flags & SCF_DO_STCLASS && !scan->flags
3381 && OP(scan) == IFMATCH ) { /* Lookahead */
3382 cl_init(pRExC_state, &intrnl);
3383 data_fake.start_class = &intrnl;
3384 f |= SCF_DO_STCLASS_AND;
c277df42 3385 }
1de06328
YO
3386 if (flags & SCF_WHILEM_VISITED_POS)
3387 f |= SCF_WHILEM_VISITED_POS;
3388 next = regnext(scan);
3389 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3390 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3391 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3392 if (scan->flags) {
3393 if (deltanext) {
3394 vFAIL("Variable length lookbehind not implemented");
3395 }
3396 else if (minnext > (I32)U8_MAX) {
3397 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3398 }
3399 scan->flags = (U8)minnext;
3400 }
3401 if (data) {
3402 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3403 pars++;
3404 if (data_fake.flags & SF_HAS_EVAL)
3405 data->flags |= SF_HAS_EVAL;
3406 data->whilem_c = data_fake.whilem_c;
3407 }
3408 if (f & SCF_DO_STCLASS_AND) {
3409 const int was = (data->start_class->flags & ANYOF_EOS);
3410
3411 cl_and(data->start_class, &intrnl);
3412 if (was)
3413 data->start_class->flags |= ANYOF_EOS;
3414 }
be8e71aa 3415 }
1de06328
YO
3416#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3417 else {
3418 /* Positive Lookahead/lookbehind
3419 In this case we can do fixed string optimisation,
3420 but we must be careful about it. Note in the case of
3421 lookbehind the positions will be offset by the minimum
3422 length of the pattern, something we won't know about
3423 until after the recurse.
3424 */
3425 I32 deltanext, fake = 0;
3426 regnode *nscan;
3427 struct regnode_charclass_class intrnl;
3428 int f = 0;
3429 /* We use SAVEFREEPV so that when the full compile
3430 is finished perl will clean up the allocated
3431 minlens when its all done. This was we don't
3432 have to worry about freeing them when we know
3433 they wont be used, which would be a pain.
3434 */
3435 I32 *minnextp;
3436 Newx( minnextp, 1, I32 );
3437 SAVEFREEPV(minnextp);
3438
3439 if (data) {
3440 StructCopy(data, &data_fake, scan_data_t);
3441 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3442 f |= SCF_DO_SUBSTR;
3443 if (scan->flags)
3444 scan_commit(pRExC_state, &data_fake,minlenp);
3445 data_fake.last_found=newSVsv(data->last_found);
3446 }
3447 }
3448 else
3449 data_fake.last_closep = &fake;
3450 data_fake.flags = 0;
3451 if (is_inf)
3452 data_fake.flags |= SF_IS_INF;
3453 if ( flags & SCF_DO_STCLASS && !scan->flags
3454 && OP(scan) == IFMATCH ) { /* Lookahead */
3455 cl_init(pRExC_state, &intrnl);
3456 data_fake.start_class = &intrnl;
3457 f |= SCF_DO_STCLASS_AND;
3458 }
3459 if (flags & SCF_WHILEM_VISITED_POS)
3460 f |= SCF_WHILEM_VISITED_POS;
3461 next = regnext(scan);
3462 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3463
3464 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3465 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3466 if (scan->flags) {
3467 if (deltanext) {
3468 vFAIL("Variable length lookbehind not implemented");
3469 }
3470 else if (*minnextp > (I32)U8_MAX) {
3471 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3472 }
3473 scan->flags = (U8)*minnextp;
3474 }
40d049e4 3475
1de06328 3476 *minnextp += min;
40d049e4 3477
1de06328
YO
3478 if (f & SCF_DO_STCLASS_AND) {
3479 const int was = (data->start_class->flags & ANYOF_EOS);
3480
3481 cl_and(data->start_class, &intrnl);
3482 if (was)
3483 data->start_class->flags |= ANYOF_EOS;
40d049e4 3484 }
1de06328
YO
3485 if (data) {
3486 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3487 pars++;
3488 if (data_fake.flags & SF_HAS_EVAL)
3489 data->flags |= SF_HAS_EVAL;
3490 data->whilem_c = data_fake.whilem_c;
3491 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3492 if (RExC_rx->minlen<*minnextp)
3493 RExC_rx->minlen=*minnextp;
3494 scan_commit(pRExC_state, &data_fake, minnextp);
3495 SvREFCNT_dec(data_fake.last_found);
3496
3497 if ( data_fake.minlen_fixed != minlenp )
3498 {
3499 data->offset_fixed= data_fake.offset_fixed;
3500 data->minlen_fixed= data_fake.minlen_fixed;
3501 data->lookbehind_fixed+= scan->flags;
3502 }
3503 if ( data_fake.minlen_float != minlenp )
3504 {
3505 data->minlen_float= data_fake.minlen_float;
3506 data->offset_float_min=data_fake.offset_float_min;
3507 data->offset_float_max=data_fake.offset_float_max;
3508 data->lookbehind_float+= scan->flags;
3509 }
3510 }
3511 }
3512
653099ff 3513
653099ff 3514 }
1de06328 3515#endif
a0ed51b3
LW
3516 }
3517 else if (OP(scan) == OPEN) {
40d049e4
YO
3518 if (stopparen != (I32)ARG(scan))
3519 pars++;
a0ed51b3 3520 }
cb434fcc 3521 else if (OP(scan) == CLOSE) {
40d049e4
YO
3522 if (stopparen == (I32)ARG(scan)) {
3523 break;
3524 }
eb160463 3525 if ((I32)ARG(scan) == is_par) {
cb434fcc 3526 next = regnext(scan);
c277df42 3527
cb434fcc
IZ
3528 if ( next && (OP(next) != WHILEM) && next < last)
3529 is_par = 0; /* Disable optimization */
3530 }
3531 if (data)
3532 *(data->last_closep) = ARG(scan);
a0ed51b3 3533 }
1a147d38 3534 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3535 /* set the pointer */
3536 I32 paren;
3537 regnode *start;
3538 regnode *end;
1a147d38 3539 if (OP(scan) == GOSUB) {
40d049e4
YO
3540 paren = ARG(scan);
3541 RExC_recurse[ARG2L(scan)] = scan;
3542 start = RExC_open_parens[paren-1];
3543 end = RExC_close_parens[paren-1];
3544 } else {
3545 paren = 0;
3546 start = RExC_rx->program + 1;
3547 end = RExC_opend;
3548 }
3549 assert(start);
3550 assert(end);
3551 if (!recursed) {
3552 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3553 SAVEFREEPV(recursed);
3554 }
3555 if (!PAREN_TEST(recursed,paren+1)) {
3556 I32 deltanext = 0;
3557 PAREN_SET(recursed,paren+1);
3558
3559 DEBUG_PEEP("goto",start,depth);
3560 min += study_chunk(
3561 pRExC_state,
3562 &start,
3563 minlenp,
3564 &deltanext,
3565 end+1,
3566 data,
3567 paren,
3568 recursed,
3569 and_withp,
3570 flags,depth+1);
3571 delta+=deltanext;
3572 if (deltanext == I32_MAX) {
3573 is_inf = is_inf_internal = 1;
3574 delta=deltanext;
3575 }
3576 DEBUG_PEEP("rtrn",end,depth);
3577 PAREN_UNSET(recursed,paren+1);
3578 } else {
3579 if (flags & SCF_DO_SUBSTR) {
3580 scan_commit(pRExC_state,data,minlenp);
3581 data->longest = &(data->longest_float);
3582 }
3583 is_inf = is_inf_internal = 1;
3584 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3585 cl_anything(pRExC_state, data->start_class);
3586 flags &= ~SCF_DO_STCLASS;
3587 }
3588 }
a0ed51b3 3589 else if (OP(scan) == EVAL) {
c277df42
IZ
3590 if (data)
3591 data->flags |= SF_HAS_EVAL;
3592 }
e2e6a0f1 3593 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
40d049e4
YO
3594 if (flags & SCF_DO_SUBSTR) {
3595 scan_commit(pRExC_state,data,minlenp);
3596 flags &= ~SCF_DO_SUBSTR;
3597 }
e2e6a0f1
YO
3598 if (data && OP(scan)==ACCEPT) {
3599 data->flags |= SCF_SEEN_ACCEPT;
3600 if (stopmin > min)
3601 stopmin = min;
3602 }
40d049e4
YO
3603 }
3604 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3605 {
0f5d15d6 3606 if (flags & SCF_DO_SUBSTR) {
1de06328 3607 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3608 data->longest = &(data->longest_float);
3609 }
3610 is_inf = is_inf_internal = 1;
653099ff 3611 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3612 cl_anything(pRExC_state, data->start_class);
96776eda 3613 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3614 }
786e8c11 3615#ifdef TRIE_STUDY_OPT
40d049e4 3616#ifdef FULL_TRIE_STUDY
786e8c11
YO
3617 else if (PL_regkind[OP(scan)] == TRIE) {
3618 /* NOTE - There is similar code to this block above for handling
40d049e4 3619 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3620 check there too. */
7f69552c 3621 regnode *trie_node= scan;
786e8c11
YO
3622 regnode *tail= regnext(scan);
3623 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3624 I32 max1 = 0, min1 = I32_MAX;
3625 struct regnode_charclass_class accum;
3626
3627 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3628 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3629 if (flags & SCF_DO_STCLASS)
3630 cl_init_zero(pRExC_state, &accum);
3631
3632 if (!trie->jump) {
3633 min1= trie->minlen;
3634 max1= trie->maxlen;
3635 } else {
3636 const regnode *nextbranch= NULL;
3637 U32 word;
3638
3639 for ( word=1 ; word <= trie->wordcount ; word++)
3640 {
3641 I32 deltanext=0, minnext=0, f = 0, fake;
3642 struct regnode_charclass_class this_class;
3643
3644 data_fake.flags = 0;
3645 if (data) {
3646 data_fake.whilem_c = data->whilem_c;
3647 data_fake.last_closep = data->last_closep;
3648 }
3649 else
3650 data_fake.last_closep = &fake;
3651
3652 if (flags & SCF_DO_STCLASS) {
3653 cl_init(pRExC_state, &this_class);
3654 data_fake.start_class = &this_class;
3655 f = SCF_DO_STCLASS_AND;
3656 }
3657 if (flags & SCF_WHILEM_VISITED_POS)
3658 f |= SCF_WHILEM_VISITED_POS;
3659
3660 if (trie->jump[word]) {
3661 if (!nextbranch)
7f69552c
YO
3662 nextbranch = trie_node + trie->jump[0];
3663 scan= trie_node + trie->jump[word];
786e8c11
YO
3664 /* We go from the jump point to the branch that follows
3665 it. Note this means we need the vestigal unused branches
3666 even though they arent otherwise used.
3667 */
40d049e4
YO
3668 minnext = study_chunk(pRExC_state, &scan, minlenp,
3669 &deltanext, (regnode *)nextbranch, &data_fake,
3670 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3671 }
3672 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3673 nextbranch= regnext((regnode*)nextbranch);
3674
3675 if (min1 > (I32)(minnext + trie->minlen))
3676 min1 = minnext + trie->minlen;
3677 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3678 max1 = minnext + deltanext + trie->maxlen;
3679 if (deltanext == I32_MAX)
3680 is_inf = is_inf_internal = 1;
3681
3682 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3683 pars++;
e2e6a0f1
YO
3684 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3685 if ( stopmin > min + min1)
3686 stopmin = min + min1;
3687 flags &= ~SCF_DO_SUBSTR;
3688 if (data)
3689 data->flags |= SCF_SEEN_ACCEPT;
3690 }
786e8c11
YO
3691 if (data) {
3692 if (data_fake.flags & SF_HAS_EVAL)
3693 data->flags |= SF_HAS_EVAL;
3694 data->whilem_c = data_fake.whilem_c;
3695 }
3696 if (flags & SCF_DO_STCLASS)
3697 cl_or(pRExC_state, &accum, &this_class);
3698 }