This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fwd: Memory leak with s/// and hashes
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
e2e6a0f1 117 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
118 I32 extralen;
119 I32 seen_zerolen;
120 I32 seen_evals;
40d049e4
YO
121 regnode **open_parens; /* pointers to open parens */
122 regnode **close_parens; /* pointers to close parens */
123 regnode *opend; /* END node in program */
1aa99e6b 124 I32 utf8;
6bda09f9 125 HV *charnames; /* cache of named sequences */
81714fb9 126 HV *paren_names; /* Paren names */
40d049e4
YO
127 regnode **recurse; /* Recurse regops */
128 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
3dab1dad 133#ifdef DEBUGGING
be8e71aa 134 const char *lastparse;
3dab1dad
YO
135 I32 lastnum;
136#define RExC_lastparse (pRExC_state->lastparse)
137#define RExC_lastnum (pRExC_state->lastnum)
138#endif
830247a4
IZ
139} RExC_state_t;
140
e2509266 141#define RExC_flags (pRExC_state->flags)
830247a4
IZ
142#define RExC_precomp (pRExC_state->precomp)
143#define RExC_rx (pRExC_state->rx)
fac92740 144#define RExC_start (pRExC_state->start)
830247a4
IZ
145#define RExC_end (pRExC_state->end)
146#define RExC_parse (pRExC_state->parse)
147#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 148#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 149#define RExC_emit (pRExC_state->emit)
fac92740 150#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
151#define RExC_naughty (pRExC_state->naughty)
152#define RExC_sawback (pRExC_state->sawback)
153#define RExC_seen (pRExC_state->seen)
154#define RExC_size (pRExC_state->size)
155#define RExC_npar (pRExC_state->npar)
e2e6a0f1 156#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
157#define RExC_extralen (pRExC_state->extralen)
158#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
159#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 160#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 161#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
162#define RExC_open_parens (pRExC_state->open_parens)
163#define RExC_close_parens (pRExC_state->close_parens)
164#define RExC_opend (pRExC_state->opend)
81714fb9 165#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
166#define RExC_recurse (pRExC_state->recurse)
167#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 168
a687059c
LW
169#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
170#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
171 ((*s) == '{' && regcurly(s)))
a687059c 172
35c8bce7
LW
173#ifdef SPSTART
174#undef SPSTART /* dratted cpp namespace... */
175#endif
a687059c
LW
176/*
177 * Flags to be passed up and down.
178 */
a687059c 179#define WORST 0 /* Worst case. */
821b33a5 180#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
181#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
182#define SPSTART 0x4 /* Starts with * or +. */
183#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 184
3dab1dad
YO
185#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
186
07be1b83
YO
187/* whether trie related optimizations are enabled */
188#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
189#define TRIE_STUDY_OPT
786e8c11 190#define FULL_TRIE_STUDY
07be1b83
YO
191#define TRIE_STCLASS
192#endif
1de06328
YO
193
194
40d049e4
YO
195
196#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
197#define PBITVAL(paren) (1 << ((paren) & 7))
198#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
199#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
200#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
201
202
1de06328
YO
203/* About scan_data_t.
204
205 During optimisation we recurse through the regexp program performing
206 various inplace (keyhole style) optimisations. In addition study_chunk
207 and scan_commit populate this data structure with information about
208 what strings MUST appear in the pattern. We look for the longest
209 string that must appear for at a fixed location, and we look for the
210 longest string that may appear at a floating location. So for instance
211 in the pattern:
212
213 /FOO[xX]A.*B[xX]BAR/
214
215 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
216 strings (because they follow a .* construct). study_chunk will identify
217 both FOO and BAR as being the longest fixed and floating strings respectively.
218
219 The strings can be composites, for instance
220
221 /(f)(o)(o)/
222
223 will result in a composite fixed substring 'foo'.
224
225 For each string some basic information is maintained:
226
227 - offset or min_offset
228 This is the position the string must appear at, or not before.
229 It also implicitly (when combined with minlenp) tells us how many
230 character must match before the string we are searching.
231 Likewise when combined with minlenp and the length of the string
232 tells us how many characters must appear after the string we have
233 found.
234
235 - max_offset
236 Only used for floating strings. This is the rightmost point that
237 the string can appear at. Ifset to I32 max it indicates that the
238 string can occur infinitely far to the right.
239
240 - minlenp
241 A pointer to the minimum length of the pattern that the string
242 was found inside. This is important as in the case of positive
243 lookahead or positive lookbehind we can have multiple patterns
244 involved. Consider
245
246 /(?=FOO).*F/
247
248 The minimum length of the pattern overall is 3, the minimum length
249 of the lookahead part is 3, but the minimum length of the part that
250 will actually match is 1. So 'FOO's minimum length is 3, but the
251 minimum length for the F is 1. This is important as the minimum length
252 is used to determine offsets in front of and behind the string being
253 looked for. Since strings can be composites this is the length of the
254 pattern at the time it was commited with a scan_commit. Note that
255 the length is calculated by study_chunk, so that the minimum lengths
256 are not known until the full pattern has been compiled, thus the
257 pointer to the value.
258
259 - lookbehind
260
261 In the case of lookbehind the string being searched for can be
262 offset past the start point of the final matching string.
263 If this value was just blithely removed from the min_offset it would
264 invalidate some of the calculations for how many chars must match
265 before or after (as they are derived from min_offset and minlen and
266 the length of the string being searched for).
267 When the final pattern is compiled and the data is moved from the
268 scan_data_t structure into the regexp structure the information
269 about lookbehind is factored in, with the information that would
270 have been lost precalculated in the end_shift field for the
271 associated string.
272
273 The fields pos_min and pos_delta are used to store the minimum offset
274 and the delta to the maximum offset at the current point in the pattern.
275
276*/
2c2d71f5
JH
277
278typedef struct scan_data_t {
1de06328
YO
279 /*I32 len_min; unused */
280 /*I32 len_delta; unused */
2c2d71f5
JH
281 I32 pos_min;
282 I32 pos_delta;
283 SV *last_found;
1de06328 284 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
285 I32 last_start_min;
286 I32 last_start_max;
1de06328
YO
287 SV **longest; /* Either &l_fixed, or &l_float. */
288 SV *longest_fixed; /* longest fixed string found in pattern */
289 I32 offset_fixed; /* offset where it starts */
290 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
291 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
292 SV *longest_float; /* longest floating string found in pattern */
293 I32 offset_float_min; /* earliest point in string it can appear */
294 I32 offset_float_max; /* latest point in string it can appear */
295 I32 *minlen_float; /* pointer to the minlen relevent to the string */
296 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
297 I32 flags;
298 I32 whilem_c;
cb434fcc 299 I32 *last_closep;
653099ff 300 struct regnode_charclass_class *start_class;
2c2d71f5
JH
301} scan_data_t;
302
a687059c 303/*
e50aee73 304 * Forward declarations for pregcomp()'s friends.
a687059c 305 */
a0d0e21e 306
27da23d5 307static const scan_data_t zero_scan_data =
1de06328 308 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
309
310#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
311#define SF_BEFORE_SEOL 0x0001
312#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
313#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
314#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
315
09b7f37c
CB
316#ifdef NO_UNARY_PLUS
317# define SF_FIX_SHIFT_EOL (0+2)
318# define SF_FL_SHIFT_EOL (0+4)
319#else
320# define SF_FIX_SHIFT_EOL (+2)
321# define SF_FL_SHIFT_EOL (+4)
322#endif
c277df42
IZ
323
324#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
325#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
326
327#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
328#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
329#define SF_IS_INF 0x0040
330#define SF_HAS_PAR 0x0080
331#define SF_IN_PAR 0x0100
332#define SF_HAS_EVAL 0x0200
333#define SCF_DO_SUBSTR 0x0400
653099ff
GS
334#define SCF_DO_STCLASS_AND 0x0800
335#define SCF_DO_STCLASS_OR 0x1000
336#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 337#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 338
786e8c11 339#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 340#define SCF_SEEN_ACCEPT 0x8000
07be1b83 341
eb160463 342#define UTF (RExC_utf8 != 0)
e2509266
JH
343#define LOC ((RExC_flags & PMf_LOCALE) != 0)
344#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 345
ffc61ed2 346#define OOB_UNICODE 12345678
93733859 347#define OOB_NAMEDCLASS -1
b8c5462f 348
a0ed51b3
LW
349#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
350#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
351
8615cb43 352
b45f050a
JF
353/* length of regex to show in messages that don't mark a position within */
354#define RegexLengthToShowInErrorMessages 127
355
356/*
357 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
358 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
359 * op/pragma/warn/regcomp.
360 */
7253e4e3
RK
361#define MARKER1 "<-- HERE" /* marker as it appears in the description */
362#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 363
7253e4e3 364#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
365
366/*
367 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
368 * arg. Show regex, up to a maximum length. If it's too long, chop and add
369 * "...".
370 */
ccb2c380 371#define FAIL(msg) STMT_START { \
bfed75c6 372 const char *ellipses = ""; \
ccb2c380
MP
373 IV len = RExC_end - RExC_precomp; \
374 \
375 if (!SIZE_ONLY) \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
377 if (len > RegexLengthToShowInErrorMessages) { \
378 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
379 len = RegexLengthToShowInErrorMessages - 10; \
380 ellipses = "..."; \
381 } \
382 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
383 msg, (int)len, RExC_precomp, ellipses); \
384} STMT_END
8615cb43 385
b45f050a 386/*
b45f050a
JF
387 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
388 */
ccb2c380 389#define Simple_vFAIL(m) STMT_START { \
a28509cc 390 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
391 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
392 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
b45f050a
JF
394
395/*
396 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
397 */
ccb2c380
MP
398#define vFAIL(m) STMT_START { \
399 if (!SIZE_ONLY) \
400 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
401 Simple_vFAIL(m); \
402} STMT_END
b45f050a
JF
403
404/*
405 * Like Simple_vFAIL(), but accepts two arguments.
406 */
ccb2c380 407#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 408 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
409 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
410 (int)offset, RExC_precomp, RExC_precomp + offset); \
411} STMT_END
b45f050a
JF
412
413/*
414 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
415 */
ccb2c380
MP
416#define vFAIL2(m,a1) STMT_START { \
417 if (!SIZE_ONLY) \
418 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
419 Simple_vFAIL2(m, a1); \
420} STMT_END
b45f050a
JF
421
422
423/*
424 * Like Simple_vFAIL(), but accepts three arguments.
425 */
ccb2c380 426#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
429 (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
434 */
ccb2c380
MP
435#define vFAIL3(m,a1,a2) STMT_START { \
436 if (!SIZE_ONLY) \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
438 Simple_vFAIL3(m, a1, a2); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts four arguments.
443 */
ccb2c380 444#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a 449
ccb2c380 450#define vWARN(loc,m) STMT_START { \
a28509cc 451 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
452 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
453 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
454} STMT_END
455
456#define vWARNdep(loc,m) STMT_START { \
a28509cc 457 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
458 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
459 "%s" REPORT_LOCATION, \
460 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
461} STMT_END
462
463
464#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 465 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
467 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
469
470#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 471 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
472 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
473 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
475
476#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 477 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
480} STMT_END
481
482#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 483 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
484 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
485 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
486} STMT_END
9d1d55b5 487
8615cb43 488
cd439c50 489/* Allow for side effects in s */
ccb2c380
MP
490#define REGC(c,s) STMT_START { \
491 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
492} STMT_END
cd439c50 493
fac92740
MJD
494/* Macros for recording node offsets. 20001227 mjd@plover.com
495 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
496 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
497 * Element 0 holds the number n.
07be1b83 498 * Position is 1 indexed.
fac92740
MJD
499 */
500
ccb2c380
MP
501#define Set_Node_Offset_To_R(node,byte) STMT_START { \
502 if (! SIZE_ONLY) { \
503 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 504 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 505 if((node) < 0) { \
551405c4 506 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
507 } else { \
508 RExC_offsets[2*(node)-1] = (byte); \
509 } \
510 } \
511} STMT_END
512
513#define Set_Node_Offset(node,byte) \
514 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
515#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
516
517#define Set_Node_Length_To_R(node,len) STMT_START { \
518 if (! SIZE_ONLY) { \
519 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 520 __LINE__, (int)(node), (int)(len))); \
ccb2c380 521 if((node) < 0) { \
551405c4 522 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
523 } else { \
524 RExC_offsets[2*(node)] = (len); \
525 } \
526 } \
527} STMT_END
528
529#define Set_Node_Length(node,len) \
530 Set_Node_Length_To_R((node)-RExC_emit_start, len)
531#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
532#define Set_Node_Cur_Length(node) \
533 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
534
535/* Get offsets and lengths */
536#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
537#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
538
07be1b83
YO
539#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
540 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
541 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
542} STMT_END
543
544
545#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
546#define EXPERIMENTAL_INPLACESCAN
547#endif
548
1de06328 549#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 550DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
551 PerlIO_printf(Perl_debug_log, \
552 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
553 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
554 (int)(depth)*2, "", \
555 (IV)((data)->pos_min), \
556 (IV)((data)->pos_delta), \
557 (IV)((data)->flags), \
558 (IV)((data)->whilem_c), \
559 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
560 ); \
561 if ((data)->last_found) \
562 PerlIO_printf(Perl_debug_log, \
563 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
564 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
565 SvPVX_const((data)->last_found), \
566 (IV)((data)->last_end), \
567 (IV)((data)->last_start_min), \
568 (IV)((data)->last_start_max), \
569 ((data)->longest && \
570 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
571 SvPVX_const((data)->longest_fixed), \
572 (IV)((data)->offset_fixed), \
573 ((data)->longest && \
574 (data)->longest==&((data)->longest_float)) ? "*" : "", \
575 SvPVX_const((data)->longest_float), \
576 (IV)((data)->offset_float_min), \
577 (IV)((data)->offset_float_max) \
578 ); \
579 PerlIO_printf(Perl_debug_log,"\n"); \
580});
581
acfe0abc 582static void clear_re(pTHX_ void *r);
4327152a 583
653099ff 584/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 585 Update the longest found anchored substring and the longest found
653099ff
GS
586 floating substrings if needed. */
587
4327152a 588STATIC void
1de06328 589S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 590{
e1ec3a88
AL
591 const STRLEN l = CHR_SVLEN(data->last_found);
592 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 593 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 594
c277df42 595 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 596 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
597 if (*data->longest == data->longest_fixed) {
598 data->offset_fixed = l ? data->last_start_min : data->pos_min;
599 if (data->flags & SF_BEFORE_EOL)
b81d288d 600 data->flags
c277df42
IZ
601 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
602 else
603 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
604 data->minlen_fixed=minlenp;
605 data->lookbehind_fixed=0;
a0ed51b3
LW
606 }
607 else {
c277df42 608 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
609 data->offset_float_max = (l
610 ? data->last_start_max
c277df42 611 : data->pos_min + data->pos_delta);
9051bda5
HS
612 if ((U32)data->offset_float_max > (U32)I32_MAX)
613 data->offset_float_max = I32_MAX;
c277df42 614 if (data->flags & SF_BEFORE_EOL)
b81d288d 615 data->flags
c277df42
IZ
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
617 else
618 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
619 data->minlen_float=minlenp;
620 data->lookbehind_float=0;
c277df42
IZ
621 }
622 }
623 SvCUR_set(data->last_found, 0);
0eda9292 624 {
a28509cc 625 SV * const sv = data->last_found;
097eb12c
AL
626 if (SvUTF8(sv) && SvMAGICAL(sv)) {
627 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
628 if (mg)
629 mg->mg_len = 0;
630 }
0eda9292 631 }
c277df42
IZ
632 data->last_end = -1;
633 data->flags &= ~SF_BEFORE_EOL;
1de06328 634 DEBUG_STUDYDATA(data,0);
c277df42
IZ
635}
636
653099ff
GS
637/* Can match anything (initialization) */
638STATIC void
097eb12c 639S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 640{
653099ff 641 ANYOF_CLASS_ZERO(cl);
f8bef550 642 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 643 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
644 if (LOC)
645 cl->flags |= ANYOF_LOCALE;
646}
647
648/* Can match anything (initialization) */
649STATIC int
5f66b61c 650S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
651{
652 int value;
653
aaa51d5e 654 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
655 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
656 return 1;
1aa99e6b
IH
657 if (!(cl->flags & ANYOF_UNICODE_ALL))
658 return 0;
10edeb5d 659 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 660 return 0;
653099ff
GS
661 return 1;
662}
663
664/* Can match anything (initialization) */
665STATIC void
097eb12c 666S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 667{
8ecf7187 668 Zero(cl, 1, struct regnode_charclass_class);
653099ff 669 cl->type = ANYOF;
830247a4 670 cl_anything(pRExC_state, cl);
653099ff
GS
671}
672
673STATIC void
097eb12c 674S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 675{
8ecf7187 676 Zero(cl, 1, struct regnode_charclass_class);
653099ff 677 cl->type = ANYOF;
830247a4 678 cl_anything(pRExC_state, cl);
653099ff
GS
679 if (LOC)
680 cl->flags |= ANYOF_LOCALE;
681}
682
683/* 'And' a given class with another one. Can create false positives */
684/* We assume that cl is not inverted */
685STATIC void
5f66b61c 686S_cl_and(struct regnode_charclass_class *cl,
a28509cc 687 const struct regnode_charclass_class *and_with)
653099ff 688{
40d049e4
YO
689
690 assert(and_with->type == ANYOF);
653099ff
GS
691 if (!(and_with->flags & ANYOF_CLASS)
692 && !(cl->flags & ANYOF_CLASS)
693 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
694 && !(and_with->flags & ANYOF_FOLD)
695 && !(cl->flags & ANYOF_FOLD)) {
696 int i;
697
698 if (and_with->flags & ANYOF_INVERT)
699 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
700 cl->bitmap[i] &= ~and_with->bitmap[i];
701 else
702 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
703 cl->bitmap[i] &= and_with->bitmap[i];
704 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
705 if (!(and_with->flags & ANYOF_EOS))
706 cl->flags &= ~ANYOF_EOS;
1aa99e6b 707
14ebb1a2
JH
708 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
709 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
710 cl->flags &= ~ANYOF_UNICODE_ALL;
711 cl->flags |= ANYOF_UNICODE;
712 ARG_SET(cl, ARG(and_with));
713 }
14ebb1a2
JH
714 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
715 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 716 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
717 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
718 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 719 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
720}
721
722/* 'OR' a given class with another one. Can create false positives */
723/* We assume that cl is not inverted */
724STATIC void
097eb12c 725S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 726{
653099ff
GS
727 if (or_with->flags & ANYOF_INVERT) {
728 /* We do not use
729 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
730 * <= (B1 | !B2) | (CL1 | !CL2)
731 * which is wasteful if CL2 is small, but we ignore CL2:
732 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
733 * XXXX Can we handle case-fold? Unclear:
734 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
735 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
736 */
737 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
738 && !(or_with->flags & ANYOF_FOLD)
739 && !(cl->flags & ANYOF_FOLD) ) {
740 int i;
741
742 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
743 cl->bitmap[i] |= ~or_with->bitmap[i];
744 } /* XXXX: logic is complicated otherwise */
745 else {
830247a4 746 cl_anything(pRExC_state, cl);
653099ff
GS
747 }
748 } else {
749 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
750 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 751 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
752 || (cl->flags & ANYOF_FOLD)) ) {
753 int i;
754
755 /* OR char bitmap and class bitmap separately */
756 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
757 cl->bitmap[i] |= or_with->bitmap[i];
758 if (or_with->flags & ANYOF_CLASS) {
759 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
760 cl->classflags[i] |= or_with->classflags[i];
761 cl->flags |= ANYOF_CLASS;
762 }
763 }
764 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 765 cl_anything(pRExC_state, cl);
653099ff
GS
766 }
767 }
768 if (or_with->flags & ANYOF_EOS)
769 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
770
771 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
772 ARG(cl) != ARG(or_with)) {
773 cl->flags |= ANYOF_UNICODE_ALL;
774 cl->flags &= ~ANYOF_UNICODE;
775 }
776 if (or_with->flags & ANYOF_UNICODE_ALL) {
777 cl->flags |= ANYOF_UNICODE_ALL;
778 cl->flags &= ~ANYOF_UNICODE;
779 }
653099ff
GS
780}
781
a3621e74
YO
782#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
783#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
784#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
785#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
786
3dab1dad
YO
787
788#ifdef DEBUGGING
07be1b83 789/*
3dab1dad
YO
790 dump_trie(trie)
791 dump_trie_interim_list(trie,next_alloc)
792 dump_trie_interim_table(trie,next_alloc)
793
794 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
795 The _interim_ variants are used for debugging the interim
796 tables that are used to generate the final compressed
797 representation which is what dump_trie expects.
798
3dab1dad
YO
799 Part of the reason for their existance is to provide a form
800 of documentation as to how the different representations function.
07be1b83
YO
801
802*/
3dab1dad
YO
803
804/*
805 dump_trie(trie)
806 Dumps the final compressed table form of the trie to Perl_debug_log.
807 Used for debugging make_trie().
808*/
809
810STATIC void
811S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
812{
813 U32 state;
ab3bbdeb
YO
814 SV *sv=sv_newmortal();
815 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
816 GET_RE_DEBUG_FLAGS_DECL;
817
ab3bbdeb 818
3dab1dad
YO
819 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
820 (int)depth * 2 + 2,"",
821 "Match","Base","Ofs" );
822
823 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 824 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 825 if ( tmp ) {
ab3bbdeb
YO
826 PerlIO_printf( Perl_debug_log, "%*s",
827 colwidth,
ddc5bc0f 828 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
829 PL_colors[0], PL_colors[1],
830 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
831 PERL_PV_ESCAPE_FIRSTCHAR
832 )
833 );
3dab1dad
YO
834 }
835 }
836 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
837 (int)depth * 2 + 2,"");
838
839 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 840 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
841 PerlIO_printf( Perl_debug_log, "\n");
842
1e2e3d02 843 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 844 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
845
846 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
847
848 if ( trie->states[ state ].wordnum ) {
849 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
850 } else {
851 PerlIO_printf( Perl_debug_log, "%6s", "" );
852 }
853
854 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
855
856 if ( base ) {
857 U32 ofs = 0;
858
859 while( ( base + ofs < trie->uniquecharcount ) ||
860 ( base + ofs - trie->uniquecharcount < trie->lasttrans
861 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
862 ofs++;
863
864 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
865
866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
867 if ( ( base + ofs >= trie->uniquecharcount ) &&
868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
870 {
ab3bbdeb
YO
871 PerlIO_printf( Perl_debug_log, "%*"UVXf,
872 colwidth,
3dab1dad
YO
873 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
874 } else {
ab3bbdeb 875 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
876 }
877 }
878
879 PerlIO_printf( Perl_debug_log, "]");
880
881 }
882 PerlIO_printf( Perl_debug_log, "\n" );
883 }
884}
885/*
886 dump_trie_interim_list(trie,next_alloc)
887 Dumps a fully constructed but uncompressed trie in list form.
888 List tries normally only are used for construction when the number of
889 possible chars (trie->uniquecharcount) is very high.
890 Used for debugging make_trie().
891*/
892STATIC void
893S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
894{
895 U32 state;
ab3bbdeb
YO
896 SV *sv=sv_newmortal();
897 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
898 GET_RE_DEBUG_FLAGS_DECL;
899 /* print out the table precompression. */
ab3bbdeb
YO
900 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
901 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
902 "------:-----+-----------------\n" );
3dab1dad
YO
903
904 for( state=1 ; state < next_alloc ; state ++ ) {
905 U16 charid;
906
ab3bbdeb 907 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
908 (int)depth * 2 + 2,"", (UV)state );
909 if ( ! trie->states[ state ].wordnum ) {
910 PerlIO_printf( Perl_debug_log, "%5s| ","");
911 } else {
912 PerlIO_printf( Perl_debug_log, "W%4x| ",
913 trie->states[ state ].wordnum
914 );
915 }
916 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 917 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
918 if ( tmp ) {
919 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
920 colwidth,
ddc5bc0f 921 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
922 PL_colors[0], PL_colors[1],
923 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
924 PERL_PV_ESCAPE_FIRSTCHAR
925 ) ,
1e2e3d02
YO
926 TRIE_LIST_ITEM(state,charid).forid,
927 (UV)TRIE_LIST_ITEM(state,charid).newstate
928 );
929 if (!(charid % 10))
664e119d
RGS
930 PerlIO_printf(Perl_debug_log, "\n%*s| ",
931 (int)((depth * 2) + 14), "");
1e2e3d02 932 }
ab3bbdeb
YO
933 }
934 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
935 }
936}
937
938/*
939 dump_trie_interim_table(trie,next_alloc)
940 Dumps a fully constructed but uncompressed trie in table form.
941 This is the normal DFA style state transition table, with a few
942 twists to facilitate compression later.
943 Used for debugging make_trie().
944*/
945STATIC void
946S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
947{
948 U32 state;
949 U16 charid;
ab3bbdeb
YO
950 SV *sv=sv_newmortal();
951 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
952 GET_RE_DEBUG_FLAGS_DECL;
953
954 /*
955 print out the table precompression so that we can do a visual check
956 that they are identical.
957 */
958
959 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
960
961 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 962 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 963 if ( tmp ) {
ab3bbdeb
YO
964 PerlIO_printf( Perl_debug_log, "%*s",
965 colwidth,
ddc5bc0f 966 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
967 PL_colors[0], PL_colors[1],
968 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
969 PERL_PV_ESCAPE_FIRSTCHAR
970 )
971 );
3dab1dad
YO
972 }
973 }
974
975 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
976
977 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 978 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
979 }
980
981 PerlIO_printf( Perl_debug_log, "\n" );
982
983 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
984
985 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
986 (int)depth * 2 + 2,"",
987 (UV)TRIE_NODENUM( state ) );
988
989 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
990 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
991 if (v)
992 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
993 else
994 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
995 }
996 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
998 } else {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1000 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1001 }
1002 }
07be1b83 1003}
3dab1dad
YO
1004
1005#endif
1006
786e8c11
YO
1007/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1008 startbranch: the first branch in the whole branch sequence
1009 first : start branch of sequence of branch-exact nodes.
1010 May be the same as startbranch
1011 last : Thing following the last branch.
1012 May be the same as tail.
1013 tail : item following the branch sequence
1014 count : words in the sequence
1015 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1016 depth : indent depth
3dab1dad 1017
786e8c11 1018Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1019
786e8c11
YO
1020A trie is an N'ary tree where the branches are determined by digital
1021decomposition of the key. IE, at the root node you look up the 1st character and
1022follow that branch repeat until you find the end of the branches. Nodes can be
1023marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1024
786e8c11 1025 /he|she|his|hers/
72f13be8 1026
786e8c11
YO
1027would convert into the following structure. Numbers represent states, letters
1028following numbers represent valid transitions on the letter from that state, if
1029the number is in square brackets it represents an accepting state, otherwise it
1030will be in parenthesis.
07be1b83 1031
786e8c11
YO
1032 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1033 | |
1034 | (2)
1035 | |
1036 (1) +-i->(6)-+-s->[7]
1037 |
1038 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1039
786e8c11
YO
1040 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1041
1042This shows that when matching against the string 'hers' we will begin at state 1
1043read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1044then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1045is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1046single traverse. We store a mapping from accepting to state to which word was
1047matched, and then when we have multiple possibilities we try to complete the
1048rest of the regex in the order in which they occured in the alternation.
1049
1050The only prior NFA like behaviour that would be changed by the TRIE support is
1051the silent ignoring of duplicate alternations which are of the form:
1052
1053 / (DUPE|DUPE) X? (?{ ... }) Y /x
1054
1055Thus EVAL blocks follwing a trie may be called a different number of times with
1056and without the optimisation. With the optimisations dupes will be silently
1057ignored. This inconsistant behaviour of EVAL type nodes is well established as
1058the following demonstrates:
1059
1060 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1061
1062which prints out 'word' three times, but
1063
1064 'words'=~/(word|word|word)(?{ print $1 })S/
1065
1066which doesnt print it out at all. This is due to other optimisations kicking in.
1067
1068Example of what happens on a structural level:
1069
1070The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1071
1072 1: CURLYM[1] {1,32767}(18)
1073 5: BRANCH(8)
1074 6: EXACT <ac>(16)
1075 8: BRANCH(11)
1076 9: EXACT <ad>(16)
1077 11: BRANCH(14)
1078 12: EXACT <ab>(16)
1079 16: SUCCEED(0)
1080 17: NOTHING(18)
1081 18: END(0)
1082
1083This would be optimizable with startbranch=5, first=5, last=16, tail=16
1084and should turn into:
1085
1086 1: CURLYM[1] {1,32767}(18)
1087 5: TRIE(16)
1088 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1089 <ac>
1090 <ad>
1091 <ab>
1092 16: SUCCEED(0)
1093 17: NOTHING(18)
1094 18: END(0)
1095
1096Cases where tail != last would be like /(?foo|bar)baz/:
1097
1098 1: BRANCH(4)
1099 2: EXACT <foo>(8)
1100 4: BRANCH(7)
1101 5: EXACT <bar>(8)
1102 7: TAIL(8)
1103 8: EXACT <baz>(10)
1104 10: END(0)
1105
1106which would be optimizable with startbranch=1, first=1, last=7, tail=8
1107and would end up looking like:
1108
1109 1: TRIE(8)
1110 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1111 <foo>
1112 <bar>
1113 7: TAIL(8)
1114 8: EXACT <baz>(10)
1115 10: END(0)
1116
1117 d = uvuni_to_utf8_flags(d, uv, 0);
1118
1119is the recommended Unicode-aware way of saying
1120
1121 *(d++) = uv;
1122*/
1123
1e2e3d02 1124#define TRIE_STORE_REVCHAR \
786e8c11 1125 STMT_START { \
1e2e3d02 1126 SV *tmp = newSVpvs(""); \
786e8c11 1127 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1128 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1129 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1130 } STMT_END
1131
1132#define TRIE_READ_CHAR STMT_START { \
1133 wordlen++; \
1134 if ( UTF ) { \
1135 if ( folder ) { \
1136 if ( foldlen > 0 ) { \
1137 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1138 foldlen -= len; \
1139 scan += len; \
1140 len = 0; \
1141 } else { \
1142 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1143 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1144 foldlen -= UNISKIP( uvc ); \
1145 scan = foldbuf + UNISKIP( uvc ); \
1146 } \
1147 } else { \
1148 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1149 } \
1150 } else { \
1151 uvc = (U32)*uc; \
1152 len = 1; \
1153 } \
1154} STMT_END
1155
1156
1157
1158#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1159 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1160 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1161 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1162 } \
1163 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1164 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1165 TRIE_LIST_CUR( state )++; \
1166} STMT_END
07be1b83 1167
786e8c11
YO
1168#define TRIE_LIST_NEW(state) STMT_START { \
1169 Newxz( trie->states[ state ].trans.list, \
1170 4, reg_trie_trans_le ); \
1171 TRIE_LIST_CUR( state ) = 1; \
1172 TRIE_LIST_LEN( state ) = 4; \
1173} STMT_END
07be1b83 1174
786e8c11
YO
1175#define TRIE_HANDLE_WORD(state) STMT_START { \
1176 U16 dupe= trie->states[ state ].wordnum; \
1177 regnode * const noper_next = regnext( noper ); \
1178 \
1179 if (trie->wordlen) \
1180 trie->wordlen[ curword ] = wordlen; \
1181 DEBUG_r({ \
1182 /* store the word for dumping */ \
1183 SV* tmp; \
1184 if (OP(noper) != NOTHING) \
1185 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1186 else \
1187 tmp = newSVpvn( "", 0 ); \
1188 if ( UTF ) SvUTF8_on( tmp ); \
1189 av_push( trie->words, tmp ); \
1190 }); \
1191 \
1192 curword++; \
1193 \
1194 if ( noper_next < tail ) { \
1195 if (!trie->jump) \
1196 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1197 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1198 if (!jumper) \
1199 jumper = noper_next; \
1200 if (!nextbranch) \
1201 nextbranch= regnext(cur); \
1202 } \
1203 \
1204 if ( dupe ) { \
1205 /* So it's a dupe. This means we need to maintain a */\
1206 /* linked-list from the first to the next. */\
1207 /* we only allocate the nextword buffer when there */\
1208 /* a dupe, so first time we have to do the allocation */\
1209 if (!trie->nextword) \
1210 Newxz( trie->nextword, word_count + 1, U16); \
1211 while ( trie->nextword[dupe] ) \
1212 dupe= trie->nextword[dupe]; \
1213 trie->nextword[dupe]= curword; \
1214 } else { \
1215 /* we haven't inserted this word yet. */ \
1216 trie->states[ state ].wordnum = curword; \
1217 } \
1218} STMT_END
07be1b83 1219
3dab1dad 1220
786e8c11
YO
1221#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1222 ( ( base + charid >= ucharcount \
1223 && base + charid < ubound \
1224 && state == trie->trans[ base - ucharcount + charid ].check \
1225 && trie->trans[ base - ucharcount + charid ].next ) \
1226 ? trie->trans[ base - ucharcount + charid ].next \
1227 : ( state==1 ? special : 0 ) \
1228 )
3dab1dad 1229
786e8c11
YO
1230#define MADE_TRIE 1
1231#define MADE_JUMP_TRIE 2
1232#define MADE_EXACT_TRIE 4
3dab1dad 1233
a3621e74 1234STATIC I32
786e8c11 1235S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1236{
27da23d5 1237 dVAR;
a3621e74
YO
1238 /* first pass, loop through and scan words */
1239 reg_trie_data *trie;
1240 regnode *cur;
9f7f3913 1241 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1242 STRLEN len = 0;
1243 UV uvc = 0;
1244 U16 curword = 0;
1245 U32 next_alloc = 0;
786e8c11
YO
1246 regnode *jumper = NULL;
1247 regnode *nextbranch = NULL;
7f69552c 1248 regnode *convert = NULL;
a3621e74 1249 /* we just use folder as a flag in utf8 */
e1ec3a88 1250 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1251 ? PL_fold
1252 : ( flags == EXACTFL
1253 ? PL_fold_locale
1254 : NULL
1255 )
1256 );
1257
e1ec3a88 1258 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1259 SV *re_trie_maxbuff;
3dab1dad
YO
1260#ifndef DEBUGGING
1261 /* these are only used during construction but are useful during
8e11feef 1262 * debugging so we store them in the struct when debugging.
8e11feef 1263 */
3dab1dad 1264 STRLEN trie_charcount=0;
3dab1dad
YO
1265 AV *trie_revcharmap;
1266#endif
a3621e74 1267 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1268#ifndef DEBUGGING
1269 PERL_UNUSED_ARG(depth);
1270#endif
a3621e74 1271
a02a5408 1272 Newxz( trie, 1, reg_trie_data );
a3621e74 1273 trie->refcount = 1;
3dab1dad 1274 trie->startstate = 1;
786e8c11 1275 trie->wordcount = word_count;
a3621e74 1276 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1277 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1278 if (!(UTF && folder))
1279 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1280 DEBUG_r({
1281 trie->words = newAV();
a3621e74 1282 });
3dab1dad 1283 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1284
0111c4fd 1285 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1286 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1287 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1288 }
3dab1dad
YO
1289 DEBUG_OPTIMISE_r({
1290 PerlIO_printf( Perl_debug_log,
786e8c11 1291 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1292 (int)depth * 2 + 2, "",
1293 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1294 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1295 (int)depth);
3dab1dad 1296 });
7f69552c
YO
1297
1298 /* Find the node we are going to overwrite */
1299 if ( first == startbranch && OP( last ) != BRANCH ) {
1300 /* whole branch chain */
1301 convert = first;
1302 } else {
1303 /* branch sub-chain */
1304 convert = NEXTOPER( first );
1305 }
1306
a3621e74
YO
1307 /* -- First loop and Setup --
1308
1309 We first traverse the branches and scan each word to determine if it
1310 contains widechars, and how many unique chars there are, this is
1311 important as we have to build a table with at least as many columns as we
1312 have unique chars.
1313
1314 We use an array of integers to represent the character codes 0..255
1315 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1316 native representation of the character value as the key and IV's for the
1317 coded index.
1318
1319 *TODO* If we keep track of how many times each character is used we can
1320 remap the columns so that the table compression later on is more
1321 efficient in terms of memory by ensuring most common value is in the
1322 middle and the least common are on the outside. IMO this would be better
1323 than a most to least common mapping as theres a decent chance the most
1324 common letter will share a node with the least common, meaning the node
1325 will not be compressable. With a middle is most common approach the worst
1326 case is when we have the least common nodes twice.
1327
1328 */
1329
a3621e74 1330 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1331 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1332 const U8 *uc = (U8*)STRING( noper );
a28509cc 1333 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1334 STRLEN foldlen = 0;
1335 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1336 const U8 *scan = (U8*)NULL;
07be1b83 1337 U32 wordlen = 0; /* required init */
3dab1dad 1338 STRLEN chars=0;
a3621e74 1339
3dab1dad
YO
1340 if (OP(noper) == NOTHING) {
1341 trie->minlen= 0;
1342 continue;
1343 }
1344 if (trie->bitmap) {
1345 TRIE_BITMAP_SET(trie,*uc);
1346 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1347 }
a3621e74 1348 for ( ; uc < e ; uc += len ) {
3dab1dad 1349 TRIE_CHARCOUNT(trie)++;
a3621e74 1350 TRIE_READ_CHAR;
3dab1dad 1351 chars++;
a3621e74
YO
1352 if ( uvc < 256 ) {
1353 if ( !trie->charmap[ uvc ] ) {
1354 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1355 if ( folder )
1356 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1357 TRIE_STORE_REVCHAR;
a3621e74
YO
1358 }
1359 } else {
1360 SV** svpp;
1361 if ( !trie->widecharmap )
1362 trie->widecharmap = newHV();
1363
1364 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1365
1366 if ( !svpp )
e4584336 1367 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1368
1369 if ( !SvTRUE( *svpp ) ) {
1370 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1371 TRIE_STORE_REVCHAR;
a3621e74
YO
1372 }
1373 }
1374 }
3dab1dad
YO
1375 if( cur == first ) {
1376 trie->minlen=chars;
1377 trie->maxlen=chars;
1378 } else if (chars < trie->minlen) {
1379 trie->minlen=chars;
1380 } else if (chars > trie->maxlen) {
1381 trie->maxlen=chars;
1382 }
1383
a3621e74
YO
1384 } /* end first pass */
1385 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1386 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1387 (int)depth * 2 + 2,"",
85c3142d 1388 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1389 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1390 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1391 );
786e8c11 1392 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1393
1394 /*
1395 We now know what we are dealing with in terms of unique chars and
1396 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1397 representation using a flat table will take. If it's over a reasonable
1398 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1399 conservative but potentially much slower representation using an array
1400 of lists.
1401
1402 At the end we convert both representations into the same compressed
1403 form that will be used in regexec.c for matching with. The latter
1404 is a form that cannot be used to construct with but has memory
1405 properties similar to the list form and access properties similar
1406 to the table form making it both suitable for fast searches and
1407 small enough that its feasable to store for the duration of a program.
1408
1409 See the comment in the code where the compressed table is produced
1410 inplace from the flat tabe representation for an explanation of how
1411 the compression works.
1412
1413 */
1414
1415
3dab1dad 1416 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1417 /*
1418 Second Pass -- Array Of Lists Representation
1419
1420 Each state will be represented by a list of charid:state records
1421 (reg_trie_trans_le) the first such element holds the CUR and LEN
1422 points of the allocated array. (See defines above).
1423
1424 We build the initial structure using the lists, and then convert
1425 it into the compressed table form which allows faster lookups
1426 (but cant be modified once converted).
a3621e74
YO
1427 */
1428
a3621e74
YO
1429 STRLEN transcount = 1;
1430
1e2e3d02
YO
1431 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1432 "%*sCompiling trie using list compiler\n",
1433 (int)depth * 2 + 2, ""));
1434
3dab1dad 1435 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1436 TRIE_LIST_NEW(1);
1437 next_alloc = 2;
1438
1439 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1440
c445ea15
AL
1441 regnode * const noper = NEXTOPER( cur );
1442 U8 *uc = (U8*)STRING( noper );
1443 const U8 * const e = uc + STR_LEN( noper );
1444 U32 state = 1; /* required init */
1445 U16 charid = 0; /* sanity init */
1446 U8 *scan = (U8*)NULL; /* sanity init */
1447 STRLEN foldlen = 0; /* required init */
07be1b83 1448 U32 wordlen = 0; /* required init */
c445ea15
AL
1449 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1450
3dab1dad 1451 if (OP(noper) != NOTHING) {
786e8c11 1452 for ( ; uc < e ; uc += len ) {
c445ea15 1453
786e8c11 1454 TRIE_READ_CHAR;
c445ea15 1455
786e8c11
YO
1456 if ( uvc < 256 ) {
1457 charid = trie->charmap[ uvc ];
c445ea15 1458 } else {
786e8c11
YO
1459 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1460 if ( !svpp ) {
1461 charid = 0;
1462 } else {
1463 charid=(U16)SvIV( *svpp );
1464 }
c445ea15 1465 }
786e8c11
YO
1466 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1467 if ( charid ) {
a3621e74 1468
786e8c11
YO
1469 U16 check;
1470 U32 newstate = 0;
a3621e74 1471
786e8c11
YO
1472 charid--;
1473 if ( !trie->states[ state ].trans.list ) {
1474 TRIE_LIST_NEW( state );
c445ea15 1475 }
786e8c11
YO
1476 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1477 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1478 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1479 break;
1480 }
1481 }
1482 if ( ! newstate ) {
1483 newstate = next_alloc++;
1484 TRIE_LIST_PUSH( state, charid, newstate );
1485 transcount++;
1486 }
1487 state = newstate;
1488 } else {
1489 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1490 }
a28509cc 1491 }
c445ea15 1492 }
3dab1dad 1493 TRIE_HANDLE_WORD(state);
a3621e74
YO
1494
1495 } /* end second pass */
1496
1e2e3d02
YO
1497 /* next alloc is the NEXT state to be allocated */
1498 trie->statecount = next_alloc;
a3621e74
YO
1499 Renew( trie->states, next_alloc, reg_trie_state );
1500
3dab1dad
YO
1501 /* and now dump it out before we compress it */
1502 DEBUG_TRIE_COMPILE_MORE_r(
1503 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1504 );
a3621e74 1505
a02a5408 1506 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1507 {
1508 U32 state;
a3621e74
YO
1509 U32 tp = 0;
1510 U32 zp = 0;
1511
1512
1513 for( state=1 ; state < next_alloc ; state ++ ) {
1514 U32 base=0;
1515
1516 /*
1517 DEBUG_TRIE_COMPILE_MORE_r(
1518 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1519 );
1520 */
1521
1522 if (trie->states[state].trans.list) {
1523 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1524 U16 maxid=minid;
a28509cc 1525 U16 idx;
a3621e74
YO
1526
1527 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1528 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1529 if ( forid < minid ) {
1530 minid=forid;
1531 } else if ( forid > maxid ) {
1532 maxid=forid;
1533 }
a3621e74
YO
1534 }
1535 if ( transcount < tp + maxid - minid + 1) {
1536 transcount *= 2;
1537 Renew( trie->trans, transcount, reg_trie_trans );
1538 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1539 }
1540 base = trie->uniquecharcount + tp - minid;
1541 if ( maxid == minid ) {
1542 U32 set = 0;
1543 for ( ; zp < tp ; zp++ ) {
1544 if ( ! trie->trans[ zp ].next ) {
1545 base = trie->uniquecharcount + zp - minid;
1546 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1547 trie->trans[ zp ].check = state;
1548 set = 1;
1549 break;
1550 }
1551 }
1552 if ( !set ) {
1553 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1554 trie->trans[ tp ].check = state;
1555 tp++;
1556 zp = tp;
1557 }
1558 } else {
1559 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1560 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1561 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1562 trie->trans[ tid ].check = state;
1563 }
1564 tp += ( maxid - minid + 1 );
1565 }
1566 Safefree(trie->states[ state ].trans.list);
1567 }
1568 /*
1569 DEBUG_TRIE_COMPILE_MORE_r(
1570 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1571 );
1572 */
1573 trie->states[ state ].trans.base=base;
1574 }
cc601c31 1575 trie->lasttrans = tp + 1;
a3621e74
YO
1576 }
1577 } else {
1578 /*
1579 Second Pass -- Flat Table Representation.
1580
1581 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1582 We know that we will need Charcount+1 trans at most to store the data
1583 (one row per char at worst case) So we preallocate both structures
1584 assuming worst case.
1585
1586 We then construct the trie using only the .next slots of the entry
1587 structs.
1588
1589 We use the .check field of the first entry of the node temporarily to
1590 make compression both faster and easier by keeping track of how many non
1591 zero fields are in the node.
1592
1593 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1594 transition.
1595
1596 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1597 number representing the first entry of the node, and state as a
1598 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1599 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1600 are 2 entrys per node. eg:
1601
1602 A B A B
1603 1. 2 4 1. 3 7
1604 2. 0 3 3. 0 5
1605 3. 0 0 5. 0 0
1606 4. 0 0 7. 0 0
1607
1608 The table is internally in the right hand, idx form. However as we also
1609 have to deal with the states array which is indexed by nodenum we have to
1610 use TRIE_NODENUM() to convert.
1611
1612 */
1e2e3d02
YO
1613 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1614 "%*sCompiling trie using table compiler\n",
1615 (int)depth * 2 + 2, ""));
3dab1dad
YO
1616
1617 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1618 reg_trie_trans );
3dab1dad 1619 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1620 next_alloc = trie->uniquecharcount + 1;
1621
3dab1dad 1622
a3621e74
YO
1623 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1624
c445ea15 1625 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1626 const U8 *uc = (U8*)STRING( noper );
1627 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1628
1629 U32 state = 1; /* required init */
1630
1631 U16 charid = 0; /* sanity init */
1632 U32 accept_state = 0; /* sanity init */
1633 U8 *scan = (U8*)NULL; /* sanity init */
1634
1635 STRLEN foldlen = 0; /* required init */
07be1b83 1636 U32 wordlen = 0; /* required init */
a3621e74
YO
1637 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1638
3dab1dad 1639 if ( OP(noper) != NOTHING ) {
786e8c11 1640 for ( ; uc < e ; uc += len ) {
a3621e74 1641
786e8c11 1642 TRIE_READ_CHAR;
a3621e74 1643
786e8c11
YO
1644 if ( uvc < 256 ) {
1645 charid = trie->charmap[ uvc ];
1646 } else {
1647 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1648 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1649 }
786e8c11
YO
1650 if ( charid ) {
1651 charid--;
1652 if ( !trie->trans[ state + charid ].next ) {
1653 trie->trans[ state + charid ].next = next_alloc;
1654 trie->trans[ state ].check++;
1655 next_alloc += trie->uniquecharcount;
1656 }
1657 state = trie->trans[ state + charid ].next;
1658 } else {
1659 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1660 }
1661 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1662 }
a3621e74 1663 }
3dab1dad
YO
1664 accept_state = TRIE_NODENUM( state );
1665 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1666
1667 } /* end second pass */
1668
3dab1dad
YO
1669 /* and now dump it out before we compress it */
1670 DEBUG_TRIE_COMPILE_MORE_r(
1671 dump_trie_interim_table(trie,next_alloc,depth+1)
1672 );
a3621e74 1673
a3621e74
YO
1674 {
1675 /*
1676 * Inplace compress the table.*
1677
1678 For sparse data sets the table constructed by the trie algorithm will
1679 be mostly 0/FAIL transitions or to put it another way mostly empty.
1680 (Note that leaf nodes will not contain any transitions.)
1681
1682 This algorithm compresses the tables by eliminating most such
1683 transitions, at the cost of a modest bit of extra work during lookup:
1684
1685 - Each states[] entry contains a .base field which indicates the
1686 index in the state[] array wheres its transition data is stored.
1687
1688 - If .base is 0 there are no valid transitions from that node.
1689
1690 - If .base is nonzero then charid is added to it to find an entry in
1691 the trans array.
1692
1693 -If trans[states[state].base+charid].check!=state then the
1694 transition is taken to be a 0/Fail transition. Thus if there are fail
1695 transitions at the front of the node then the .base offset will point
1696 somewhere inside the previous nodes data (or maybe even into a node
1697 even earlier), but the .check field determines if the transition is
1698 valid.
1699
786e8c11 1700 XXX - wrong maybe?
a3621e74
YO
1701 The following process inplace converts the table to the compressed
1702 table: We first do not compress the root node 1,and mark its all its
1703 .check pointers as 1 and set its .base pointer as 1 as well. This
1704 allows to do a DFA construction from the compressed table later, and
1705 ensures that any .base pointers we calculate later are greater than
1706 0.
1707
1708 - We set 'pos' to indicate the first entry of the second node.
1709
1710 - We then iterate over the columns of the node, finding the first and
1711 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1712 and set the .check pointers accordingly, and advance pos
1713 appropriately and repreat for the next node. Note that when we copy
1714 the next pointers we have to convert them from the original
1715 NODEIDX form to NODENUM form as the former is not valid post
1716 compression.
1717
1718 - If a node has no transitions used we mark its base as 0 and do not
1719 advance the pos pointer.
1720
1721 - If a node only has one transition we use a second pointer into the
1722 structure to fill in allocated fail transitions from other states.
1723 This pointer is independent of the main pointer and scans forward
1724 looking for null transitions that are allocated to a state. When it
1725 finds one it writes the single transition into the "hole". If the
786e8c11 1726 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1727
1728 - Once compressed we can Renew/realloc the structures to release the
1729 excess space.
1730
1731 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1732 specifically Fig 3.47 and the associated pseudocode.
1733
1734 demq
1735 */
a3b680e6 1736 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1737 U32 state, charid;
a3621e74 1738 U32 pos = 0, zp=0;
1e2e3d02 1739 trie->statecount = laststate;
a3621e74
YO
1740
1741 for ( state = 1 ; state < laststate ; state++ ) {
1742 U8 flag = 0;
a28509cc
AL
1743 const U32 stateidx = TRIE_NODEIDX( state );
1744 const U32 o_used = trie->trans[ stateidx ].check;
1745 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1746 trie->trans[ stateidx ].check = 0;
1747
1748 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1749 if ( flag || trie->trans[ stateidx + charid ].next ) {
1750 if ( trie->trans[ stateidx + charid ].next ) {
1751 if (o_used == 1) {
1752 for ( ; zp < pos ; zp++ ) {
1753 if ( ! trie->trans[ zp ].next ) {
1754 break;
1755 }
1756 }
1757 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1758 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1759 trie->trans[ zp ].check = state;
1760 if ( ++zp > pos ) pos = zp;
1761 break;
1762 }
1763 used--;
1764 }
1765 if ( !flag ) {
1766 flag = 1;
1767 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1768 }
1769 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1770 trie->trans[ pos ].check = state;
1771 pos++;
1772 }
1773 }
1774 }
cc601c31 1775 trie->lasttrans = pos + 1;
1e2e3d02 1776 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1777 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1778 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1779 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1780 (int)depth * 2 + 2,"",
1781 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1782 (IV)next_alloc,
1783 (IV)pos,
a3621e74
YO
1784 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1785 );
1786
1787 } /* end table compress */
1788 }
1e2e3d02
YO
1789 DEBUG_TRIE_COMPILE_MORE_r(
1790 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1791 (int)depth * 2 + 2, "",
1792 (UV)trie->statecount,
1793 (UV)trie->lasttrans)
1794 );
cc601c31
YO
1795 /* resize the trans array to remove unused space */
1796 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1797
3dab1dad
YO
1798 /* and now dump out the compressed format */
1799 DEBUG_TRIE_COMPILE_r(
1800 dump_trie(trie,depth+1)
1801 );
07be1b83 1802
3dab1dad 1803 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1804 U8 nodetype =(U8)(flags & 0xFF);
1805 char *str=NULL;
786e8c11 1806
07be1b83 1807#ifdef DEBUGGING
e62cc96a 1808 regnode *optimize = NULL;
b57a0404
JH
1809 U32 mjd_offset = 0;
1810 U32 mjd_nodelen = 0;
07be1b83 1811#endif
a3621e74 1812 /*
3dab1dad
YO
1813 This means we convert either the first branch or the first Exact,
1814 depending on whether the thing following (in 'last') is a branch
1815 or not and whther first is the startbranch (ie is it a sub part of
1816 the alternation or is it the whole thing.)
1817 Assuming its a sub part we conver the EXACT otherwise we convert
1818 the whole branch sequence, including the first.
a3621e74 1819 */
3dab1dad 1820 /* Find the node we are going to overwrite */
7f69552c 1821 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1822 /* branch sub-chain */
3dab1dad 1823 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1824 DEBUG_r({
1825 mjd_offset= Node_Offset((convert));
1826 mjd_nodelen= Node_Length((convert));
1827 });
7f69552c
YO
1828 /* whole branch chain */
1829 } else {
1830 DEBUG_r({
1831 const regnode *nop = NEXTOPER( convert );
1832 mjd_offset= Node_Offset((nop));
1833 mjd_nodelen= Node_Length((nop));
1834 });
07be1b83 1835 }
7f69552c 1836
07be1b83
YO
1837 DEBUG_OPTIMISE_r(
1838 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1839 (int)depth * 2 + 2, "",
786e8c11 1840 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1841 );
a3621e74 1842
3dab1dad
YO
1843 /* But first we check to see if there is a common prefix we can
1844 split out as an EXACT and put in front of the TRIE node. */
1845 trie->startstate= 1;
786e8c11 1846 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1847 U32 state;
1e2e3d02 1848 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1849 U32 ofs = 0;
8e11feef
RGS
1850 I32 idx = -1;
1851 U32 count = 0;
1852 const U32 base = trie->states[ state ].trans.base;
a3621e74 1853
3dab1dad 1854 if ( trie->states[state].wordnum )
8e11feef 1855 count = 1;
a3621e74 1856
8e11feef 1857 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1858 if ( ( base + ofs >= trie->uniquecharcount ) &&
1859 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1860 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1861 {
3dab1dad 1862 if ( ++count > 1 ) {
8e11feef 1863 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1864 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1865 if ( state == 1 ) break;
3dab1dad
YO
1866 if ( count == 2 ) {
1867 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1868 DEBUG_OPTIMISE_r(
8e11feef
RGS
1869 PerlIO_printf(Perl_debug_log,
1870 "%*sNew Start State=%"UVuf" Class: [",
1871 (int)depth * 2 + 2, "",
786e8c11 1872 (UV)state));
be8e71aa
YO
1873 if (idx >= 0) {
1874 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1875 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1876
3dab1dad 1877 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1878 if ( folder )
1879 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1880 DEBUG_OPTIMISE_r(
07be1b83 1881 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1882 );
8e11feef
RGS
1883 }
1884 }
1885 TRIE_BITMAP_SET(trie,*ch);
1886 if ( folder )
1887 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1888 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1889 }
1890 idx = ofs;
1891 }
3dab1dad
YO
1892 }
1893 if ( count == 1 ) {
1894 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1895 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1896 DEBUG_OPTIMISE_r(
8e11feef
RGS
1897 PerlIO_printf( Perl_debug_log,
1898 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1899 (int)depth * 2 + 2, "",
786e8c11 1900 (UV)state, (UV)idx, ch)
3dab1dad
YO
1901 );
1902 if ( state==1 ) {
1903 OP( convert ) = nodetype;
1904 str=STRING(convert);
1905 STR_LEN(convert)=0;
1906 }
1907 *str++=*ch;
1908 STR_LEN(convert)++;
a3621e74 1909
8e11feef 1910 } else {
f9049ba1 1911#ifdef DEBUGGING
8e11feef
RGS
1912 if (state>1)
1913 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1914#endif
8e11feef
RGS
1915 break;
1916 }
1917 }
3dab1dad 1918 if (str) {
8e11feef 1919 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1920 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1921 trie->startstate = state;
07be1b83
YO
1922 trie->minlen -= (state - 1);
1923 trie->maxlen -= (state - 1);
1924 DEBUG_r({
1925 regnode *fix = convert;
1926 mjd_nodelen++;
1927 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1928 while( ++fix < n ) {
1929 Set_Node_Offset_Length(fix, 0, 0);
1930 }
1931 });
8e11feef
RGS
1932 if (trie->maxlen) {
1933 convert = n;
1934 } else {
3dab1dad 1935 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1936 DEBUG_r(optimize= n);
3dab1dad
YO
1937 }
1938 }
1939 }
a5ca303d
YO
1940 if (!jumper)
1941 jumper = last;
3dab1dad 1942 if ( trie->maxlen ) {
8e11feef
RGS
1943 NEXT_OFF( convert ) = (U16)(tail - convert);
1944 ARG_SET( convert, data_slot );
786e8c11
YO
1945 /* Store the offset to the first unabsorbed branch in
1946 jump[0], which is otherwise unused by the jump logic.
1947 We use this when dumping a trie and during optimisation. */
1948 if (trie->jump)
7f69552c 1949 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1950
786e8c11
YO
1951 /* XXXX */
1952 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1953 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1954 {
1955 OP( convert ) = TRIEC;
1956 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1957 Safefree(trie->bitmap);
1958 trie->bitmap= NULL;
1959 } else
1960 OP( convert ) = TRIE;
a3621e74 1961
3dab1dad
YO
1962 /* store the type in the flags */
1963 convert->flags = nodetype;
a5ca303d
YO
1964 DEBUG_r({
1965 optimize = convert
1966 + NODE_STEP_REGNODE
1967 + regarglen[ OP( convert ) ];
1968 });
1969 /* XXX We really should free up the resource in trie now,
1970 as we won't use them - (which resources?) dmq */
3dab1dad 1971 }
a3621e74 1972 /* needed for dumping*/
e62cc96a 1973 DEBUG_r(if (optimize) {
07be1b83 1974 regnode *opt = convert;
e62cc96a 1975 while ( ++opt < optimize) {
07be1b83
YO
1976 Set_Node_Offset_Length(opt,0,0);
1977 }
786e8c11
YO
1978 /*
1979 Try to clean up some of the debris left after the
1980 optimisation.
a3621e74 1981 */
786e8c11 1982 while( optimize < jumper ) {
07be1b83 1983 mjd_nodelen += Node_Length((optimize));
a3621e74 1984 OP( optimize ) = OPTIMIZED;
07be1b83 1985 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1986 optimize++;
1987 }
07be1b83 1988 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1989 });
1990 } /* end node insert */
07be1b83 1991#ifndef DEBUGGING
6e8b4190 1992 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1993#endif
786e8c11
YO
1994 return trie->jump
1995 ? MADE_JUMP_TRIE
1996 : trie->startstate>1
1997 ? MADE_EXACT_TRIE
1998 : MADE_TRIE;
1999}
2000
2001STATIC void
2002S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2003{
2004/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2005
2006 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2007 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2008 ISBN 0-201-10088-6
2009
2010 We find the fail state for each state in the trie, this state is the longest proper
2011 suffix of the current states 'word' that is also a proper prefix of another word in our
2012 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2013 the DFA not to have to restart after its tried and failed a word at a given point, it
2014 simply continues as though it had been matching the other word in the first place.
2015 Consider
2016 'abcdgu'=~/abcdefg|cdgu/
2017 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2018 fail, which would bring use to the state representing 'd' in the second word where we would
2019 try 'g' and succeed, prodceding to match 'cdgu'.
2020 */
2021 /* add a fail transition */
2022 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2023 U32 *q;
2024 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2025 const U32 numstates = trie->statecount;
786e8c11
YO
2026 const U32 ubound = trie->lasttrans + ucharcount;
2027 U32 q_read = 0;
2028 U32 q_write = 0;
2029 U32 charid;
2030 U32 base = trie->states[ 1 ].trans.base;
2031 U32 *fail;
2032 reg_ac_data *aho;
2033 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2034 GET_RE_DEBUG_FLAGS_DECL;
2035#ifndef DEBUGGING
2036 PERL_UNUSED_ARG(depth);
2037#endif
2038
2039
2040 ARG_SET( stclass, data_slot );
2041 Newxz( aho, 1, reg_ac_data );
2042 RExC_rx->data->data[ data_slot ] = (void*)aho;
2043 aho->trie=trie;
2044 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2045 numstates * sizeof(reg_trie_state));
786e8c11
YO
2046 Newxz( q, numstates, U32);
2047 Newxz( aho->fail, numstates, U32 );
2048 aho->refcount = 1;
2049 fail = aho->fail;
2050 /* initialize fail[0..1] to be 1 so that we always have
2051 a valid final fail state */
2052 fail[ 0 ] = fail[ 1 ] = 1;
2053
2054 for ( charid = 0; charid < ucharcount ; charid++ ) {
2055 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2056 if ( newstate ) {
2057 q[ q_write ] = newstate;
2058 /* set to point at the root */
2059 fail[ q[ q_write++ ] ]=1;
2060 }
2061 }
2062 while ( q_read < q_write) {
2063 const U32 cur = q[ q_read++ % numstates ];
2064 base = trie->states[ cur ].trans.base;
2065
2066 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2067 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2068 if (ch_state) {
2069 U32 fail_state = cur;
2070 U32 fail_base;
2071 do {
2072 fail_state = fail[ fail_state ];
2073 fail_base = aho->states[ fail_state ].trans.base;
2074 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2075
2076 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2077 fail[ ch_state ] = fail_state;
2078 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2079 {
2080 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2081 }
2082 q[ q_write++ % numstates] = ch_state;
2083 }
2084 }
2085 }
2086 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2087 when we fail in state 1, this allows us to use the
2088 charclass scan to find a valid start char. This is based on the principle
2089 that theres a good chance the string being searched contains lots of stuff
2090 that cant be a start char.
2091 */
2092 fail[ 0 ] = fail[ 1 ] = 0;
2093 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2094 PerlIO_printf(Perl_debug_log,
2095 "%*sStclass Failtable (%"UVuf" states): 0",
2096 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2097 );
786e8c11
YO
2098 for( q_read=1; q_read<numstates; q_read++ ) {
2099 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2100 }
2101 PerlIO_printf(Perl_debug_log, "\n");
2102 });
2103 Safefree(q);
2104 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2105}
2106
786e8c11 2107
a3621e74 2108/*
5d1c421c
JH
2109 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2110 * These need to be revisited when a newer toolchain becomes available.
2111 */
2112#if defined(__sparc64__) && defined(__GNUC__)
2113# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2114# undef SPARC64_GCC_WORKAROUND
2115# define SPARC64_GCC_WORKAROUND 1
2116# endif
2117#endif
2118
07be1b83
YO
2119#define DEBUG_PEEP(str,scan,depth) \
2120 DEBUG_OPTIMISE_r({ \
2121 SV * const mysv=sv_newmortal(); \
2122 regnode *Next = regnext(scan); \
2123 regprop(RExC_rx, mysv, scan); \
7f69552c 2124 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2125 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2126 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2127 });
2128
1de06328
YO
2129
2130
2131
2132
07be1b83
YO
2133#define JOIN_EXACT(scan,min,flags) \
2134 if (PL_regkind[OP(scan)] == EXACT) \
2135 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2136
be8e71aa 2137STATIC U32
07be1b83
YO
2138S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2139 /* Merge several consecutive EXACTish nodes into one. */
2140 regnode *n = regnext(scan);
2141 U32 stringok = 1;
2142 regnode *next = scan + NODE_SZ_STR(scan);
2143 U32 merged = 0;
2144 U32 stopnow = 0;
2145#ifdef DEBUGGING
2146 regnode *stop = scan;
72f13be8 2147 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2148#else
d47053eb
RGS
2149 PERL_UNUSED_ARG(depth);
2150#endif
2151#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2152 PERL_UNUSED_ARG(flags);
2153 PERL_UNUSED_ARG(val);
07be1b83 2154#endif
07be1b83
YO
2155 DEBUG_PEEP("join",scan,depth);
2156
2157 /* Skip NOTHING, merge EXACT*. */
2158 while (n &&
2159 ( PL_regkind[OP(n)] == NOTHING ||
2160 (stringok && (OP(n) == OP(scan))))
2161 && NEXT_OFF(n)
2162 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2163
2164 if (OP(n) == TAIL || n > next)
2165 stringok = 0;
2166 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2167 DEBUG_PEEP("skip:",n,depth);
2168 NEXT_OFF(scan) += NEXT_OFF(n);
2169 next = n + NODE_STEP_REGNODE;
2170#ifdef DEBUGGING
2171 if (stringok)
2172 stop = n;
2173#endif
2174 n = regnext(n);
2175 }
2176 else if (stringok) {
786e8c11 2177 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2178 regnode * const nnext = regnext(n);
2179
2180 DEBUG_PEEP("merg",n,depth);
2181
2182 merged++;
2183 if (oldl + STR_LEN(n) > U8_MAX)
2184 break;
2185 NEXT_OFF(scan) += NEXT_OFF(n);
2186 STR_LEN(scan) += STR_LEN(n);
2187 next = n + NODE_SZ_STR(n);
2188 /* Now we can overwrite *n : */
2189 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2190#ifdef DEBUGGING
2191 stop = next - 1;
2192#endif
2193 n = nnext;
2194 if (stopnow) break;
2195 }
2196
d47053eb
RGS
2197#ifdef EXPERIMENTAL_INPLACESCAN
2198 if (flags && !NEXT_OFF(n)) {
2199 DEBUG_PEEP("atch", val, depth);
2200 if (reg_off_by_arg[OP(n)]) {
2201 ARG_SET(n, val - n);
2202 }
2203 else {
2204 NEXT_OFF(n) = val - n;
2205 }
2206 stopnow = 1;
2207 }
07be1b83
YO
2208#endif
2209 }
2210
2211 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2212 /*
2213 Two problematic code points in Unicode casefolding of EXACT nodes:
2214
2215 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2216 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2217
2218 which casefold to
2219
2220 Unicode UTF-8
2221
2222 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2223 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2224
2225 This means that in case-insensitive matching (or "loose matching",
2226 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2227 length of the above casefolded versions) can match a target string
2228 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2229 This would rather mess up the minimum length computation.
2230
2231 What we'll do is to look for the tail four bytes, and then peek
2232 at the preceding two bytes to see whether we need to decrease
2233 the minimum length by four (six minus two).
2234
2235 Thanks to the design of UTF-8, there cannot be false matches:
2236 A sequence of valid UTF-8 bytes cannot be a subsequence of
2237 another valid sequence of UTF-8 bytes.
2238
2239 */
2240 char * const s0 = STRING(scan), *s, *t;
2241 char * const s1 = s0 + STR_LEN(scan) - 1;
2242 char * const s2 = s1 - 4;
e294cc5d
JH
2243#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2244 const char t0[] = "\xaf\x49\xaf\x42";
2245#else
07be1b83 2246 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2247#endif
07be1b83
YO
2248 const char * const t1 = t0 + 3;
2249
2250 for (s = s0 + 2;
2251 s < s2 && (t = ninstr(s, s1, t0, t1));
2252 s = t + 4) {
e294cc5d
JH
2253#ifdef EBCDIC
2254 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2255 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2256#else
07be1b83
YO
2257 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2258 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2259#endif
07be1b83
YO
2260 *min -= 4;
2261 }
2262 }
2263
2264#ifdef DEBUGGING
2265 /* Allow dumping */
2266 n = scan + NODE_SZ_STR(scan);
2267 while (n <= stop) {
2268 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2269 OP(n) = OPTIMIZED;
2270 NEXT_OFF(n) = 0;
2271 }
2272 n++;
2273 }
2274#endif
2275 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2276 return stopnow;
2277}
2278
653099ff
GS
2279/* REx optimizer. Converts nodes into quickier variants "in place".
2280 Finds fixed substrings. */
2281
a0288114 2282/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2283 to the position after last scanned or to NULL. */
2284
40d049e4
YO
2285#define INIT_AND_WITHP \
2286 assert(!and_withp); \
2287 Newx(and_withp,1,struct regnode_charclass_class); \
2288 SAVEFREEPV(and_withp)
07be1b83 2289
76e3520e 2290STATIC I32
40d049e4 2291S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2292 I32 *minlenp, I32 *deltap,
40d049e4
YO
2293 regnode *last,
2294 scan_data_t *data,
2295 I32 stopparen,
2296 U8* recursed,
2297 struct regnode_charclass_class *and_withp,
2298 U32 flags, U32 depth)
c277df42
IZ
2299 /* scanp: Start here (read-write). */
2300 /* deltap: Write maxlen-minlen here. */
2301 /* last: Stop before this one. */
40d049e4
YO
2302 /* data: string data about the pattern */
2303 /* stopparen: treat close N as END */
2304 /* recursed: which subroutines have we recursed into */
2305 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2306{
97aff369 2307 dVAR;
c277df42
IZ
2308 I32 min = 0, pars = 0, code;
2309 regnode *scan = *scanp, *next;
2310 I32 delta = 0;
2311 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2312 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2313 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2314 scan_data_t data_fake;
a3621e74 2315 SV *re_trie_maxbuff = NULL;
786e8c11 2316 regnode *first_non_open = scan;
e2e6a0f1 2317 I32 stopmin = I32_MAX;
a3621e74 2318 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2319#ifdef DEBUGGING
40d049e4 2320 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2321#endif
40d049e4 2322
786e8c11 2323 if ( depth == 0 ) {
40d049e4 2324 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2325 first_non_open=regnext(first_non_open);
2326 }
2327
b81d288d 2328
c277df42
IZ
2329 while (scan && OP(scan) != END && scan < last) {
2330 /* Peephole optimizer: */
1de06328 2331 DEBUG_STUDYDATA(data,depth);
07be1b83 2332 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2333 JOIN_EXACT(scan,&min,0);
a3621e74 2334
653099ff
GS
2335 /* Follow the next-chain of the current node and optimize
2336 away all the NOTHINGs from it. */
c277df42 2337 if (OP(scan) != CURLYX) {
a3b680e6 2338 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2339 ? I32_MAX
2340 /* I32 may be smaller than U16 on CRAYs! */
2341 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2342 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2343 int noff;
2344 regnode *n = scan;
b81d288d 2345
c277df42
IZ
2346 /* Skip NOTHING and LONGJMP. */
2347 while ((n = regnext(n))
3dab1dad 2348 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2349 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2350 && off + noff < max)
2351 off += noff;
2352 if (reg_off_by_arg[OP(scan)])
2353 ARG(scan) = off;
b81d288d 2354 else
c277df42
IZ
2355 NEXT_OFF(scan) = off;
2356 }
a3621e74 2357
07be1b83 2358
3dab1dad 2359
653099ff
GS
2360 /* The principal pseudo-switch. Cannot be a switch, since we
2361 look into several different things. */
b81d288d 2362 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2363 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2364 next = regnext(scan);
2365 code = OP(scan);
a3621e74 2366 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2367
2368 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2369 /* NOTE - There is similar code to this block below for handling
2370 TRIE nodes on a re-study. If you change stuff here check there
2371 too. */
c277df42 2372 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2373 struct regnode_charclass_class accum;
d4c19fe8 2374 regnode * const startbranch=scan;
c277df42 2375
653099ff 2376 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2377 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2378 if (flags & SCF_DO_STCLASS)
830247a4 2379 cl_init_zero(pRExC_state, &accum);
a3621e74 2380
c277df42 2381 while (OP(scan) == code) {
830247a4 2382 I32 deltanext, minnext, f = 0, fake;
653099ff 2383 struct regnode_charclass_class this_class;
c277df42
IZ
2384
2385 num++;
2386 data_fake.flags = 0;
40d049e4 2387 if (data) {
2c2d71f5 2388 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2389 data_fake.last_closep = data->last_closep;
2390 }
2391 else
2392 data_fake.last_closep = &fake;
c277df42
IZ
2393 next = regnext(scan);
2394 scan = NEXTOPER(scan);
2395 if (code != BRANCH)
2396 scan = NEXTOPER(scan);
653099ff 2397 if (flags & SCF_DO_STCLASS) {
830247a4 2398 cl_init(pRExC_state, &this_class);
653099ff
GS
2399 data_fake.start_class = &this_class;
2400 f = SCF_DO_STCLASS_AND;
b81d288d 2401 }
e1901655
IZ
2402 if (flags & SCF_WHILEM_VISITED_POS)
2403 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2404
653099ff 2405 /* we suppose the run is continuous, last=next...*/
1de06328 2406 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2407 next, &data_fake,
2408 stopparen, recursed, NULL, f,depth+1);
b81d288d 2409 if (min1 > minnext)
c277df42
IZ
2410 min1 = minnext;
2411 if (max1 < minnext + deltanext)
2412 max1 = minnext + deltanext;
2413 if (deltanext == I32_MAX)
aca2d497 2414 is_inf = is_inf_internal = 1;
c277df42
IZ
2415 scan = next;
2416 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2417 pars++;
e2e6a0f1
YO
2418 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2419 if ( stopmin > minnext)
2420 stopmin = min + min1;
2421 flags &= ~SCF_DO_SUBSTR;
2422 if (data)
2423 data->flags |= SCF_SEEN_ACCEPT;
2424 }
3dab1dad
YO
2425 if (data) {
2426 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2427 data->flags |= SF_HAS_EVAL;
2c2d71f5 2428 data->whilem_c = data_fake.whilem_c;
3dab1dad 2429 }
653099ff 2430 if (flags & SCF_DO_STCLASS)
830247a4 2431 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2432 if (code == SUSPEND)
c277df42
IZ
2433 break;
2434 }
2435 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2436 min1 = 0;
2437 if (flags & SCF_DO_SUBSTR) {
2438 data->pos_min += min1;
2439 data->pos_delta += max1 - min1;
2440 if (max1 != min1 || is_inf)
2441 data->longest = &(data->longest_float);
2442 }
2443 min += min1;
2444 delta += max1 - min1;
653099ff 2445 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2446 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2447 if (min1) {
40d049e4 2448 cl_and(data->start_class, and_withp);
653099ff
GS
2449 flags &= ~SCF_DO_STCLASS;
2450 }
2451 }
2452 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2453 if (min1) {
2454 cl_and(data->start_class, &accum);
653099ff 2455 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2456 }
2457 else {
b81d288d 2458 /* Switch to OR mode: cache the old value of
de0c8cb8 2459 * data->start_class */
40d049e4
YO
2460 INIT_AND_WITHP;
2461 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2462 struct regnode_charclass_class);
2463 flags &= ~SCF_DO_STCLASS_AND;
2464 StructCopy(&accum, data->start_class,
2465 struct regnode_charclass_class);
2466 flags |= SCF_DO_STCLASS_OR;
2467 data->start_class->flags |= ANYOF_EOS;
2468 }
653099ff 2469 }
a3621e74 2470
786e8c11 2471 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2472 /* demq.
2473
2474 Assuming this was/is a branch we are dealing with: 'scan' now
2475 points at the item that follows the branch sequence, whatever
2476 it is. We now start at the beginning of the sequence and look
2477 for subsequences of
2478
786e8c11
YO
2479 BRANCH->EXACT=>x1
2480 BRANCH->EXACT=>x2
2481 tail
a3621e74
YO
2482
2483 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2484
2485 If we can find such a subseqence we need to turn the first
2486 element into a trie and then add the subsequent branch exact
2487 strings to the trie.
2488
2489 We have two cases
2490
786e8c11 2491 1. patterns where the whole set of branch can be converted.
a3621e74 2492
786e8c11 2493 2. patterns where only a subset can be converted.
a3621e74
YO
2494
2495 In case 1 we can replace the whole set with a single regop
2496 for the trie. In case 2 we need to keep the start and end
2497 branchs so
2498
2499 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2500 becomes BRANCH TRIE; BRANCH X;
2501
786e8c11
YO
2502 There is an additional case, that being where there is a
2503 common prefix, which gets split out into an EXACT like node
2504 preceding the TRIE node.
2505
2506 If x(1..n)==tail then we can do a simple trie, if not we make
2507 a "jump" trie, such that when we match the appropriate word
2508 we "jump" to the appopriate tail node. Essentailly we turn
2509 a nested if into a case structure of sorts.
a3621e74
YO
2510
2511 */
786e8c11 2512
3dab1dad 2513 int made=0;
0111c4fd
RGS
2514 if (!re_trie_maxbuff) {
2515 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2516 if (!SvIOK(re_trie_maxbuff))
2517 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2518 }
786e8c11 2519 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2520 regnode *cur;
2521 regnode *first = (regnode *)NULL;
2522 regnode *last = (regnode *)NULL;
2523 regnode *tail = scan;
2524 U8 optype = 0;
2525 U32 count=0;
2526
2527#ifdef DEBUGGING
c445ea15 2528 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2529#endif
2530 /* var tail is used because there may be a TAIL
2531 regop in the way. Ie, the exacts will point to the
2532 thing following the TAIL, but the last branch will
2533 point at the TAIL. So we advance tail. If we
2534 have nested (?:) we may have to move through several
2535 tails.
2536 */
2537
2538 while ( OP( tail ) == TAIL ) {
2539 /* this is the TAIL generated by (?:) */
2540 tail = regnext( tail );
2541 }
2542
3dab1dad 2543
a3621e74 2544 DEBUG_OPTIMISE_r({
32fc9b6a 2545 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2546 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2547 (int)depth * 2 + 2, "",
2548 "Looking for TRIE'able sequences. Tail node is: ",
2549 SvPV_nolen_const( mysv )
a3621e74
YO
2550 );
2551 });
3dab1dad 2552
a3621e74
YO
2553 /*
2554
2555 step through the branches, cur represents each
2556 branch, noper is the first thing to be matched
2557 as part of that branch and noper_next is the
2558 regnext() of that node. if noper is an EXACT
2559 and noper_next is the same as scan (our current
2560 position in the regex) then the EXACT branch is
2561 a possible optimization target. Once we have
2562 two or more consequetive such branches we can
2563 create a trie of the EXACT's contents and stich
2564 it in place. If the sequence represents all of
2565 the branches we eliminate the whole thing and
2566 replace it with a single TRIE. If it is a
2567 subsequence then we need to stitch it in. This
2568 means the first branch has to remain, and needs
2569 to be repointed at the item on the branch chain
2570 following the last branch optimized. This could
2571 be either a BRANCH, in which case the
2572 subsequence is internal, or it could be the
2573 item following the branch sequence in which
2574 case the subsequence is at the end.
2575
2576 */
2577
2578 /* dont use tail as the end marker for this traverse */
2579 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2580 regnode * const noper = NEXTOPER( cur );
be981c67 2581#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2582 regnode * const noper_next = regnext( noper );
be981c67 2583#endif
a3621e74 2584
a3621e74 2585 DEBUG_OPTIMISE_r({
32fc9b6a 2586 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2587 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2588 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2589
32fc9b6a 2590 regprop(RExC_rx, mysv, noper);
a3621e74 2591 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2592 SvPV_nolen_const(mysv));
a3621e74
YO
2593
2594 if ( noper_next ) {
32fc9b6a 2595 regprop(RExC_rx, mysv, noper_next );
a3621e74 2596 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2597 SvPV_nolen_const(mysv));
a3621e74 2598 }
3dab1dad
YO
2599 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2600 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2601 });
3dab1dad
YO
2602 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2603 : PL_regkind[ OP( noper ) ] == EXACT )
2604 || OP(noper) == NOTHING )
786e8c11
YO
2605#ifdef NOJUMPTRIE
2606 && noper_next == tail
2607#endif
2608 && count < U16_MAX)
a3621e74
YO
2609 {
2610 count++;
3dab1dad
YO
2611 if ( !first || optype == NOTHING ) {
2612 if (!first) first = cur;
a3621e74
YO
2613 optype = OP( noper );
2614 } else {
a3621e74 2615 last = cur;
a3621e74
YO
2616 }
2617 } else {
2618 if ( last ) {
786e8c11
YO
2619 make_trie( pRExC_state,
2620 startbranch, first, cur, tail, count,
2621 optype, depth+1 );
a3621e74 2622 }
3dab1dad 2623 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2624#ifdef NOJUMPTRIE
2625 && noper_next == tail
2626#endif
2627 ){
a3621e74
YO
2628 count = 1;
2629 first = cur;
2630 optype = OP( noper );
2631 } else {
2632 count = 0;
2633 first = NULL;
2634 optype = 0;
2635 }
2636 last = NULL;
2637 }
2638 }
2639 DEBUG_OPTIMISE_r({
32fc9b6a 2640 regprop(RExC_rx, mysv, cur);
a3621e74 2641 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2642 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2643 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2644
2645 });
2646 if ( last ) {
786e8c11 2647 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2648#ifdef TRIE_STUDY_OPT
786e8c11
YO
2649 if ( ((made == MADE_EXACT_TRIE &&
2650 startbranch == first)
2651 || ( first_non_open == first )) &&
5d458dd8 2652 depth==0 ) {
786e8c11 2653 flags |= SCF_TRIE_RESTUDY;
5d458dd8
YO
2654 if ( startbranch == first
2655 && scan == tail )
2656 {
2657 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2658 }
2659 }
3dab1dad 2660#endif
07be1b83 2661 }
a3621e74 2662 }
3dab1dad
YO
2663
2664 } /* do trie */
786e8c11 2665
a0ed51b3 2666 }
a3621e74 2667 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2668 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2669 } else /* single branch is optimized. */
c277df42
IZ
2670 scan = NEXTOPER(scan);
2671 continue;
a0ed51b3
LW
2672 }
2673 else if (OP(scan) == EXACT) {
cd439c50 2674 I32 l = STR_LEN(scan);
c445ea15 2675 UV uc;
a0ed51b3 2676 if (UTF) {
a3b680e6 2677 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2678 l = utf8_length(s, s + l);
9041c2e3 2679 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2680 } else {
2681 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2682 }
2683 min += l;
c277df42 2684 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2685 /* The code below prefers earlier match for fixed
2686 offset, later match for variable offset. */
2687 if (data->last_end == -1) { /* Update the start info. */
2688 data->last_start_min = data->pos_min;
2689 data->last_start_max = is_inf
b81d288d 2690 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2691 }
cd439c50 2692 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2693 if (UTF)
2694 SvUTF8_on(data->last_found);
0eda9292 2695 {
9a957fbc 2696 SV * const sv = data->last_found;
a28509cc 2697 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2698 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2699 if (mg && mg->mg_len >= 0)
5e43f467
JH
2700 mg->mg_len += utf8_length((U8*)STRING(scan),
2701 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2702 }
c277df42
IZ
2703 data->last_end = data->pos_min + l;
2704 data->pos_min += l; /* As in the first entry. */
2705 data->flags &= ~SF_BEFORE_EOL;
2706 }
653099ff
GS
2707 if (flags & SCF_DO_STCLASS_AND) {
2708 /* Check whether it is compatible with what we know already! */
2709 int compat = 1;
2710
1aa99e6b 2711 if (uc >= 0x100 ||
516a5887 2712 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2713 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2714 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2715 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2716 )
653099ff
GS
2717 compat = 0;
2718 ANYOF_CLASS_ZERO(data->start_class);
2719 ANYOF_BITMAP_ZERO(data->start_class);
2720 if (compat)
1aa99e6b 2721 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2722 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2723 if (uc < 0x100)
2724 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2725 }
2726 else if (flags & SCF_DO_STCLASS_OR) {
2727 /* false positive possible if the class is case-folded */
1aa99e6b 2728 if (uc < 0x100)
9b877dbb
IH
2729 ANYOF_BITMAP_SET(data->start_class, uc);
2730 else
2731 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2732 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2733 cl_and(data->start_class, and_withp);
653099ff
GS
2734 }
2735 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2736 }
3dab1dad 2737 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2738 I32 l = STR_LEN(scan);
1aa99e6b 2739 UV uc = *((U8*)STRING(scan));
653099ff
GS
2740
2741 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2742 if (flags & SCF_DO_SUBSTR) {
2743 assert(data);
1de06328 2744 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2745 }
a0ed51b3 2746 if (UTF) {
6136c704 2747 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2748 l = utf8_length(s, s + l);
9041c2e3 2749 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2750 }
2751 min += l;
ecaa9b9c 2752 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2753 data->pos_min += l;
653099ff
GS
2754 if (flags & SCF_DO_STCLASS_AND) {
2755 /* Check whether it is compatible with what we know already! */
2756 int compat = 1;
2757
1aa99e6b 2758 if (uc >= 0x100 ||
516a5887 2759 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2760 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2761 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2762 compat = 0;
2763 ANYOF_CLASS_ZERO(data->start_class);
2764 ANYOF_BITMAP_ZERO(data->start_class);
2765 if (compat) {
1aa99e6b 2766 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2767 data->start_class->flags &= ~ANYOF_EOS;
2768 data->start_class->flags |= ANYOF_FOLD;
2769 if (OP(scan) == EXACTFL)
2770 data->start_class->flags |= ANYOF_LOCALE;
2771 }
2772 }
2773 else if (flags & SCF_DO_STCLASS_OR) {
2774 if (data->start_class->flags & ANYOF_FOLD) {
2775 /* false positive possible if the class is case-folded.
2776 Assume that the locale settings are the same... */
1aa99e6b
IH
2777 if (uc < 0x100)
2778 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2779 data->start_class->flags &= ~ANYOF_EOS;
2780 }
40d049e4 2781 cl_and(data->start_class, and_withp);
653099ff
GS
2782 }
2783 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2784 }
bfed75c6 2785 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2786 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2787 I32 f = flags, pos_before = 0;
d4c19fe8 2788 regnode * const oscan = scan;
653099ff
GS
2789 struct regnode_charclass_class this_class;
2790 struct regnode_charclass_class *oclass = NULL;
727f22e3 2791 I32 next_is_eval = 0;
653099ff 2792
3dab1dad 2793 switch (PL_regkind[OP(scan)]) {
653099ff 2794 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2795 scan = NEXTOPER(scan);
2796 goto finish;
2797 case PLUS:
653099ff 2798 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2799 next = NEXTOPER(scan);
653099ff 2800 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2801 mincount = 1;
2802 maxcount = REG_INFTY;
c277df42
IZ
2803 next = regnext(scan);
2804 scan = NEXTOPER(scan);
2805 goto do_curly;
2806 }
2807 }
2808 if (flags & SCF_DO_SUBSTR)
2809 data->pos_min++;
2810 min++;
2811 /* Fall through. */
2812 case STAR:
653099ff
GS
2813 if (flags & SCF_DO_STCLASS) {
2814 mincount = 0;
b81d288d 2815 maxcount = REG_INFTY;
653099ff
GS
2816 next = regnext(scan);
2817 scan = NEXTOPER(scan);
2818 goto do_curly;
2819 }
b81d288d 2820 is_inf = is_inf_internal = 1;
c277df42
IZ
2821 scan = regnext(scan);
2822 if (flags & SCF_DO_SUBSTR) {
1de06328 2823 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2824 data->longest = &(data->longest_float);
2825 }
2826 goto optimize_curly_tail;
2827 case CURLY:
40d049e4
YO
2828 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2829 && (scan->flags == stopparen))
2830 {
2831 mincount = 1;
2832 maxcount = 1;
2833 } else {
2834 mincount = ARG1(scan);
2835 maxcount = ARG2(scan);
2836 }
c277df42 2837 next = regnext(scan);
cb434fcc
IZ
2838 if (OP(scan) == CURLYX) {
2839 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2840 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2841 }
c277df42 2842 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2843 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2844 do_curly:
2845 if (flags & SCF_DO_SUBSTR) {
1de06328 2846 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2847 pos_before = data->pos_min;
2848 }
2849 if (data) {
2850 fl = data->flags;
2851 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2852 if (is_inf)
2853 data->flags |= SF_IS_INF;
2854 }
653099ff 2855 if (flags & SCF_DO_STCLASS) {
830247a4 2856 cl_init(pRExC_state, &this_class);
653099ff
GS
2857 oclass = data->start_class;
2858 data->start_class = &this_class;
2859 f |= SCF_DO_STCLASS_AND;
2860 f &= ~SCF_DO_STCLASS_OR;
2861 }
e1901655
IZ
2862 /* These are the cases when once a subexpression
2863 fails at a particular position, it cannot succeed
2864 even after backtracking at the enclosing scope.
b81d288d 2865
e1901655
IZ
2866 XXXX what if minimal match and we are at the
2867 initial run of {n,m}? */
2868 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2869 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2870
c277df42 2871 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2872 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2873 last, data, stopparen, recursed, NULL,
a3621e74
YO
2874 (mincount == 0
2875 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2876
2877 if (flags & SCF_DO_STCLASS)
2878 data->start_class = oclass;
2879 if (mincount == 0 || minnext == 0) {
2880 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2881 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2882 }
2883 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2884 /* Switch to OR mode: cache the old value of
653099ff 2885 * data->start_class */
40d049e4
YO
2886 INIT_AND_WITHP;
2887 StructCopy(data->start_class, and_withp,
653099ff
GS
2888 struct regnode_charclass_class);
2889 flags &= ~SCF_DO_STCLASS_AND;
2890 StructCopy(&this_class, data->start_class,
2891 struct regnode_charclass_class);
2892 flags |= SCF_DO_STCLASS_OR;
2893 data->start_class->flags |= ANYOF_EOS;
2894 }
2895 } else { /* Non-zero len */
2896 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2897 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2898 cl_and(data->start_class, and_withp);
653099ff
GS
2899 }
2900 else if (flags & SCF_DO_STCLASS_AND)
2901 cl_and(data->start_class, &this_class);
2902 flags &= ~SCF_DO_STCLASS;
2903 }
c277df42
IZ
2904 if (!scan) /* It was not CURLYX, but CURLY. */
2905 scan = next;
041457d9
DM
2906 if ( /* ? quantifier ok, except for (?{ ... }) */
2907 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2908 && (minnext == 0) && (deltanext == 0)
99799961 2909 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2910 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2911 && ckWARN(WARN_REGEXP))
b45f050a 2912 {
830247a4 2913 vWARN(RExC_parse,
b45f050a
JF
2914 "Quantifier unexpected on zero-length expression");
2915 }
2916
c277df42 2917 min += minnext * mincount;
b81d288d 2918 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2919 && (minnext + deltanext) > 0)
2920 || deltanext == I32_MAX);
aca2d497 2921 is_inf |= is_inf_internal;
c277df42
IZ
2922 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2923
2924 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2925 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2926 && data->flags & SF_IN_PAR
2927 && !(data->flags & SF_HAS_EVAL)
2928 && !deltanext && minnext == 1 ) {
2929 /* Try to optimize to CURLYN. */
2930 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2931 regnode * const nxt1 = nxt;
497b47a8
JH
2932#ifdef DEBUGGING
2933 regnode *nxt2;
2934#endif
c277df42
IZ
2935
2936 /* Skip open. */
2937 nxt = regnext(nxt);
bfed75c6 2938 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2939 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2940 && STR_LEN(nxt) == 1))
c277df42 2941 goto nogo;
497b47a8 2942#ifdef DEBUGGING
c277df42 2943 nxt2 = nxt;
497b47a8 2944#endif
c277df42 2945 nxt = regnext(nxt);
b81d288d 2946 if (OP(nxt) != CLOSE)
c277df42 2947 goto nogo;
40d049e4
YO
2948 if (RExC_open_parens) {
2949 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2950 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2951 }
c277df42 2952 /* Now we know that nxt2 is the only contents: */
eb160463 2953 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2954 OP(oscan) = CURLYN;
2955 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2956
c277df42
IZ
2957#ifdef DEBUGGING
2958 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2959 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2960 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2961 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2962 OP(nxt + 1) = OPTIMIZED; /* was count. */
2963 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2964#endif
c277df42 2965 }
c277df42
IZ
2966 nogo:
2967
2968 /* Try optimization CURLYX => CURLYM. */
b81d288d 2969 if ( OP(oscan) == CURLYX && data
c277df42 2970 && !(data->flags & SF_HAS_PAR)
c277df42 2971 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2972 && !deltanext /* atom is fixed width */
2973 && minnext != 0 /* CURLYM can't handle zero width */
2974 ) {
c277df42
IZ
2975 /* XXXX How to optimize if data == 0? */
2976 /* Optimize to a simpler form. */
2977 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2978 regnode *nxt2;
2979
2980 OP(oscan) = CURLYM;
2981 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2982 && (OP(nxt2) != WHILEM))
c277df42
IZ
2983 nxt = nxt2;
2984 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2985 /* Need to optimize away parenths. */
2986 if (data->flags & SF_IN_PAR) {
2987 /* Set the parenth number. */
2988 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2989
b81d288d 2990 if (OP(nxt) != CLOSE)
b45f050a 2991 FAIL("Panic opt close");
eb160463 2992 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2993 if (RExC_open_parens) {
2994 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2995 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2996 }
c277df42
IZ
2997 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2998 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 2999
c277df42
IZ
3000#ifdef DEBUGGING
3001 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3002 OP(nxt + 1) = OPTIMIZED; /* was count. */
3003 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3004 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3005#endif
c277df42
IZ
3006#if 0
3007 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3008 regnode *nnxt = regnext(nxt1);
b81d288d 3009
c277df42
IZ
3010 if (nnxt == nxt) {
3011 if (reg_off_by_arg[OP(nxt1)])
3012 ARG_SET(nxt1, nxt2 - nxt1);
3013 else if (nxt2 - nxt1 < U16_MAX)
3014 NEXT_OFF(nxt1) = nxt2 - nxt1;
3015 else
3016 OP(nxt) = NOTHING; /* Cannot beautify */
3017 }
3018 nxt1 = nnxt;
3019 }
3020#endif
3021 /* Optimize again: */
1de06328 3022 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3023 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3024 }
3025 else
c277df42 3026 oscan->flags = 0;
c277df42 3027 }
e1901655
IZ
3028 else if ((OP(oscan) == CURLYX)
3029 && (flags & SCF_WHILEM_VISITED_POS)
3030 /* See the comment on a similar expression above.
3031 However, this time it not a subexpression
3032 we care about, but the expression itself. */
3033 && (maxcount == REG_INFTY)
3034 && data && ++data->whilem_c < 16) {
3035 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3036 /* Find WHILEM (as in regexec.c) */
3037 regnode *nxt = oscan + NEXT_OFF(oscan);
3038
3039 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3040 nxt += ARG(nxt);
eb160463
GS
3041 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3042 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3043 }
b81d288d 3044 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3045 pars++;
3046 if (flags & SCF_DO_SUBSTR) {
c445ea15 3047 SV *last_str = NULL;
c277df42
IZ
3048 int counted = mincount != 0;
3049
3050 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3051#if defined(SPARC64_GCC_WORKAROUND)
3052 I32 b = 0;
3053 STRLEN l = 0;
cfd0369c 3054 const char *s = NULL;
5d1c421c
JH
3055 I32 old = 0;
3056
3057 if (pos_before >= data->last_start_min)
3058 b = pos_before;
3059 else
3060 b = data->last_start_min;
3061
3062 l = 0;
cfd0369c 3063 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3064 old = b - data->last_start_min;
3065
3066#else
b81d288d 3067 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3068 ? pos_before : data->last_start_min;
3069 STRLEN l;
d4c19fe8 3070 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3071 I32 old = b - data->last_start_min;
5d1c421c 3072#endif
a0ed51b3
LW
3073
3074 if (UTF)
3075 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3076
a0ed51b3 3077 l -= old;
c277df42 3078 /* Get the added string: */
79cb57f6 3079 last_str = newSVpvn(s + old, l);
0e933229
IH
3080 if (UTF)
3081 SvUTF8_on(last_str);
c277df42
IZ
3082 if (deltanext == 0 && pos_before == b) {
3083 /* What was added is a constant string */
3084 if (mincount > 1) {
3085 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3086 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3087 SvPVX_const(last_str), l, mincount - 1);
b162af07 3088 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3089 /* Add additional parts. */
b81d288d 3090 SvCUR_set(data->last_found,
c277df42
IZ
3091 SvCUR(data->last_found) - l);
3092 sv_catsv(data->last_found, last_str);
0eda9292
JH
3093 {
3094 SV * sv = data->last_found;
3095 MAGIC *mg =
3096 SvUTF8(sv) && SvMAGICAL(sv) ?
3097 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3098 if (mg && mg->mg_len >= 0)
3099 mg->mg_len += CHR_SVLEN(last_str);
3100 }
c277df42
IZ
3101 data->last_end += l * (mincount - 1);
3102 }
2a8d9689
HS
3103 } else {
3104 /* start offset must point into the last copy */
3105 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3106 data->last_start_max += is_inf ? I32_MAX
3107 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3108 }
3109 }
3110 /* It is counted once already... */
3111 data->pos_min += minnext * (mincount - counted);
3112 data->pos_delta += - counted * deltanext +
3113 (minnext + deltanext) * maxcount - minnext * mincount;
3114 if (mincount != maxcount) {
653099ff
GS
3115 /* Cannot extend fixed substrings found inside
3116 the group. */
1de06328 3117 scan_commit(pRExC_state,data,minlenp);
c277df42 3118 if (mincount && last_str) {
d4c19fe8
AL
3119 SV * const sv = data->last_found;
3120 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3121 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3122
3123 if (mg)
3124 mg->mg_len = -1;
3125 sv_setsv(sv, last_str);
c277df42 3126 data->last_end = data->pos_min;
b81d288d 3127 data->last_start_min =
a0ed51b3 3128 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3129 data->last_start_max = is_inf
3130 ? I32_MAX
c277df42 3131 : data->pos_min + data->pos_delta
a0ed51b3 3132 - CHR_SVLEN(last_str);
c277df42
IZ
3133 }
3134 data->longest = &(data->longest_float);
3135 }
aca2d497 3136 SvREFCNT_dec(last_str);
c277df42 3137 }
405ff068 3138 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3139 data->flags |= SF_HAS_EVAL;
3140 optimize_curly_tail:
c277df42 3141 if (OP(oscan) != CURLYX) {
3dab1dad 3142 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3143 && NEXT_OFF(next))
3144 NEXT_OFF(oscan) += NEXT_OFF(next);
3145 }
c277df42 3146 continue;
653099ff 3147 default: /* REF and CLUMP only? */
c277df42 3148 if (flags & SCF_DO_SUBSTR) {
1de06328 3149 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3150 data->longest = &(data->longest_float);
3151 }
aca2d497 3152 is_inf = is_inf_internal = 1;
653099ff 3153 if (flags & SCF_DO_STCLASS_OR)
830247a4 3154 cl_anything(pRExC_state, data->start_class);
653099ff 3155 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3156 break;
3157 }
a0ed51b3 3158 }
bfed75c6 3159 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3160 int value = 0;
653099ff 3161
c277df42 3162 if (flags & SCF_DO_SUBSTR) {
1de06328 3163 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3164 data->pos_min++;
3165 }
3166 min++;
653099ff
GS
3167 if (flags & SCF_DO_STCLASS) {
3168 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3169
3170 /* Some of the logic below assumes that switching
3171 locale on will only add false positives. */
3dab1dad 3172 switch (PL_regkind[OP(scan)]) {
653099ff 3173 case SANY:
653099ff
GS
3174 default:
3175 do_default:
3176 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3177 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3178 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3179 break;
3180 case REG_ANY:
3181 if (OP(scan) == SANY)
3182 goto do_default;
3183 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3184 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3185 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3186 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3187 }
3188 if (flags & SCF_DO_STCLASS_AND || !value)
3189 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3190 break;
3191 case ANYOF:
3192 if (flags & SCF_DO_STCLASS_AND)
3193 cl_and(data->start_class,
3194 (struct regnode_charclass_class*)scan);
3195 else
830247a4 3196 cl_or(pRExC_state, data->start_class,
653099ff
GS
3197 (struct regnode_charclass_class*)scan);
3198 break;
3199 case ALNUM:
3200 if (flags & SCF_DO_STCLASS_AND) {
3201 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3202 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3203 for (value = 0; value < 256; value++)
3204 if (!isALNUM(value))
3205 ANYOF_BITMAP_CLEAR(data->start_class, value);
3206 }
3207 }
3208 else {
3209 if (data->start_class->flags & ANYOF_LOCALE)
3210 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3211 else {
3212 for (value = 0; value < 256; value++)
3213 if (isALNUM(value))
b81d288d 3214 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3215 }
3216 }
3217 break;
3218 case ALNUML:
3219 if (flags & SCF_DO_STCLASS_AND) {
3220 if (data->start_class->flags & ANYOF_LOCALE)
3221 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3222 }
3223 else {
3224 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3225 data->start_class->flags |= ANYOF_LOCALE;
3226 }
3227 break;
3228 case NALNUM:
3229 if (flags & SCF_DO_STCLASS_AND) {
3230 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3231 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3232 for (value = 0; value < 256; value++)
3233 if (isALNUM(value))
3234 ANYOF_BITMAP_CLEAR(data->start_class, value);
3235 }
3236 }
3237 else {
3238 if (data->start_class->flags & ANYOF_LOCALE)
3239 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3240 else {
3241 for (value = 0; value < 256; value++)
3242 if (!isALNUM(value))
b81d288d 3243 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3244 }
3245 }
3246 break;
3247 case NALNUML:
3248 if (flags & SCF_DO_STCLASS_AND) {
3249 if (data->start_class->flags & ANYOF_LOCALE)
3250 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3251 }
3252 else {
3253 data->start_class->flags |= ANYOF_LOCALE;
3254 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3255 }
3256 break;
3257 case SPACE:
3258 if (flags & SCF_DO_STCLASS_AND) {
3259 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3260 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3261 for (value = 0; value < 256; value++)
3262 if (!isSPACE(value))
3263 ANYOF_BITMAP_CLEAR(data->start_class, value);
3264 }
3265 }
3266 else {
3267 if (data->start_class->flags & ANYOF_LOCALE)
3268 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3269 else {
3270 for (value = 0; value < 256; value++)
3271 if (isSPACE(value))
b81d288d 3272 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3273 }
3274 }
3275 break;
3276 case SPACEL:
3277 if (flags & SCF_DO_STCLASS_AND) {
3278 if (data->start_class->flags & ANYOF_LOCALE)
3279 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3280 }
3281 else {
3282 data->start_class->flags |= ANYOF_LOCALE;
3283 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3284 }
3285 break;
3286 case NSPACE:
3287 if (flags & SCF_DO_STCLASS_AND) {
3288 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3289 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3290 for (value = 0; value < 256; value++)
3291 if (isSPACE(value))
3292 ANYOF_BITMAP_CLEAR(data->start_class, value);
3293 }
3294 }
3295 else {
3296 if (data->start_class->flags & ANYOF_LOCALE)
3297 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3298 else {
3299 for (value = 0; value < 256; value++)
3300 if (!isSPACE(value))
b81d288d 3301 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3302 }
3303 }
3304 break;
3305 case NSPACEL:
3306 if (flags & SCF_DO_STCLASS_AND) {
3307 if (data->start_class->flags & ANYOF_LOCALE) {
3308 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3309 for (value = 0; value < 256; value++)
3310 if (!isSPACE(value))
3311 ANYOF_BITMAP_CLEAR(data->start_class, value);
3312 }
3313 }
3314 else {
3315 data->start_class->flags |= ANYOF_LOCALE;
3316 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3317 }
3318 break;
3319 case DIGIT:
3320 if (flags & SCF_DO_STCLASS_AND) {
3321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3322 for (value = 0; value < 256; value++)
3323 if (!isDIGIT(value))
3324 ANYOF_BITMAP_CLEAR(data->start_class, value);
3325 }
3326 else {
3327 if (data->start_class->flags & ANYOF_LOCALE)
3328 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3329 else {
3330 for (value = 0; value < 256; value++)
3331 if (isDIGIT(value))
b81d288d 3332 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3333 }
3334 }
3335 break;
3336 case NDIGIT:
3337 if (flags & SCF_DO_STCLASS_AND) {
3338 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3339 for (value = 0; value < 256; value++)
3340 if (isDIGIT(value))
3341 ANYOF_BITMAP_CLEAR(data->start_class, value);
3342 }
3343 else {
3344 if (data->start_class->flags & ANYOF_LOCALE)
3345 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3346 else {
3347 for (value = 0; value < 256; value++)
3348 if (!isDIGIT(value))
b81d288d 3349 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3350 }
3351 }
3352 break;
3353 }
3354 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3355 cl_and(data->start_class, and_withp);
653099ff
GS
3356 flags &= ~SCF_DO_STCLASS;
3357 }
a0ed51b3 3358 }
3dab1dad 3359 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3360 data->flags |= (OP(scan) == MEOL
3361 ? SF_BEFORE_MEOL
3362 : SF_BEFORE_SEOL);
a0ed51b3 3363 }
3dab1dad 3364 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3365 /* Lookbehind, or need to calculate parens/evals/stclass: */
3366 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3367 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3368 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3369 || OP(scan) == UNLESSM )
3370 {
3371 /* Negative Lookahead/lookbehind
3372 In this case we can't do fixed string optimisation.
3373 */
3374
3375 I32 deltanext, minnext, fake = 0;
3376 regnode *nscan;
3377 struct regnode_charclass_class intrnl;
3378 int f = 0;
3379
3380 data_fake.flags = 0;
3381 if (data) {
3382 data_fake.whilem_c = data->whilem_c;
3383 data_fake.last_closep = data->last_closep;
a0ed51b3 3384 }
1de06328
YO
3385 else
3386 data_fake.last_closep = &fake;
3387 if ( flags & SCF_DO_STCLASS && !scan->flags
3388 && OP(scan) == IFMATCH ) { /* Lookahead */
3389 cl_init(pRExC_state, &intrnl);
3390 data_fake.start_class = &intrnl;
3391 f |= SCF_DO_STCLASS_AND;
c277df42 3392 }
1de06328
YO
3393 if (flags & SCF_WHILEM_VISITED_POS)
3394 f |= SCF_WHILEM_VISITED_POS;
3395 next = regnext(scan);
3396 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3397 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3398 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3399 if (scan->flags) {
3400 if (deltanext) {
3401 vFAIL("Variable length lookbehind not implemented");
3402 }
3403 else if (minnext > (I32)U8_MAX) {
3404 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3405 }
3406 scan->flags = (U8)minnext;
3407 }
3408 if (data) {
3409 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3410 pars++;
3411 if (data_fake.flags & SF_HAS_EVAL)
3412 data->flags |= SF_HAS_EVAL;
3413 data->whilem_c = data_fake.whilem_c;
3414 }
3415 if (f & SCF_DO_STCLASS_AND) {
3416 const int was = (data->start_class->flags & ANYOF_EOS);
3417
3418 cl_and(data->start_class, &intrnl);
3419 if (was)
3420 data->start_class->flags |= ANYOF_EOS;
3421 }
be8e71aa 3422 }
1de06328
YO
3423#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3424 else {
3425 /* Positive Lookahead/lookbehind
3426 In this case we can do fixed string optimisation,
3427 but we must be careful about it. Note in the case of
3428 lookbehind the positions will be offset by the minimum
3429 length of the pattern, something we won't know about
3430 until after the recurse.
3431 */
3432 I32 deltanext, fake = 0;
3433 regnode *nscan;
3434 struct regnode_charclass_class intrnl;
3435 int f = 0;
3436 /* We use SAVEFREEPV so that when the full compile
3437 is finished perl will clean up the allocated
3438 minlens when its all done. This was we don't
3439 have to worry about freeing them when we know
3440 they wont be used, which would be a pain.
3441 */
3442 I32 *minnextp;
3443 Newx( minnextp, 1, I32 );
3444 SAVEFREEPV(minnextp);
3445
3446 if (data) {
3447 StructCopy(data, &data_fake, scan_data_t);
3448 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3449 f |= SCF_DO_SUBSTR;
3450 if (scan->flags)
3451 scan_commit(pRExC_state, &data_fake,minlenp);
3452 data_fake.last_found=newSVsv(data->last_found);
3453 }
3454 }
3455 else
3456 data_fake.last_closep = &fake;
3457 data_fake.flags = 0;
3458 if (is_inf)
3459 data_fake.flags |= SF_IS_INF;
3460 if ( flags & SCF_DO_STCLASS && !scan->flags
3461 && OP(scan) == IFMATCH ) { /* Lookahead */
3462 cl_init(pRExC_state, &intrnl);
3463 data_fake.start_class = &intrnl;
3464 f |= SCF_DO_STCLASS_AND;
3465 }
3466 if (flags & SCF_WHILEM_VISITED_POS)
3467 f |= SCF_WHILEM_VISITED_POS;
3468 next = regnext(scan);
3469 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3470
3471 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3472 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3473 if (scan->flags) {
3474 if (deltanext) {
3475 vFAIL("Variable length lookbehind not implemented");
3476 }
3477 else if (*minnextp > (I32)U8_MAX) {
3478 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3479 }
3480 scan->flags = (U8)*minnextp;
3481 }
40d049e4 3482
1de06328 3483 *minnextp += min;
40d049e4 3484
1de06328
YO
3485 if (f & SCF_DO_STCLASS_AND) {
3486 const int was = (data->start_class->flags & ANYOF_EOS);
3487
3488 cl_and(data->start_class, &intrnl);
3489 if (was)
3490 data->start_class->flags |= ANYOF_EOS;
40d049e4 3491 }
1de06328
YO
3492 if (data) {
3493 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3494 pars++;
3495 if (data_fake.flags & SF_HAS_EVAL)
3496 data->flags |= SF_HAS_EVAL;
3497 data->whilem_c = data_fake.whilem_c;
3498 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3499 if (RExC_rx->minlen<*minnextp)
3500 RExC_rx->minlen=*minnextp;
3501 scan_commit(pRExC_state, &data_fake, minnextp);
3502 SvREFCNT_dec(data_fake.last_found);
3503
3504 if ( data_fake.minlen_fixed != minlenp )
3505 {
3506 data->offset_fixed= data_fake.offset_fixed;
3507 data->minlen_fixed= data_fake.minlen_fixed;
3508 data->lookbehind_fixed+= scan->flags;
3509 }
3510 if ( data_fake.minlen_float != minlenp )
3511 {
3512 data->minlen_float= data_fake.minlen_float;
3513 data->offset_float_min=data_fake.offset_float_min;
3514 data->offset_float_max=data_fake.offset_float_max;
3515 data->lookbehind_float+= scan->flags;
3516 }
3517 }
3518 }
3519
653099ff 3520
653099ff 3521 }
1de06328 3522#endif
a0ed51b3
LW
3523 }
3524 else if (OP(scan) == OPEN) {
40d049e4
YO
3525 if (stopparen != (I32)ARG(scan))
3526 pars++;
a0ed51b3 3527 }
cb434fcc 3528 else if (OP(scan) == CLOSE) {
40d049e4
YO
3529 if (stopparen == (I32)ARG(scan)) {
3530 break;
3531 }
eb160463 3532 if ((I32)ARG(scan) == is_par) {
cb434fcc 3533 next = regnext(scan);
c277df42 3534
cb434fcc
IZ
3535 if ( next && (OP(next) != WHILEM) && next < last)
3536 is_par = 0; /* Disable optimization */
3537 }
3538 if (data)
3539 *(data->last_closep) = ARG(scan);
a0ed51b3 3540 }
1a147d38 3541 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3542 /* set the pointer */
3543 I32 paren;
3544 regnode *start;
3545 regnode *end;
1a147d38 3546 if (OP(scan) == GOSUB) {
40d049e4
YO
3547 paren = ARG(scan);
3548 RExC_recurse[ARG2L(scan)] = scan;
3549 start = RExC_open_parens[paren-1];
3550 end = RExC_close_parens[paren-1];
3551 } else {
3552 paren = 0;
3553 start = RExC_rx->program + 1;
3554 end = RExC_opend;
3555 }
3556 assert(start);
3557 assert(end);
3558 if (!recursed) {
3559 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3560 SAVEFREEPV(recursed);
3561 }
3562 if (!PAREN_TEST(recursed,paren+1)) {
3563 I32 deltanext = 0;
3564 PAREN_SET(recursed,paren+1);
3565
3566 DEBUG_PEEP("goto",start,depth);
3567 min += study_chunk(
3568 pRExC_state,
3569 &start,
3570 minlenp,
3571 &deltanext,
3572 end+1,
3573 data,
3574 paren,
3575 recursed,
3576 and_withp,
3577 flags,depth+1);
3578 delta+=deltanext;
3579 if (deltanext == I32_MAX) {
3580 is_inf = is_inf_internal = 1;
3581 delta=deltanext;
3582 }
3583 DEBUG_PEEP("rtrn",end,depth);
3584 PAREN_UNSET(recursed,paren+1);
3585 } else {
3586 if (flags & SCF_DO_SUBSTR) {
3587 scan_commit(pRExC_state,data,minlenp);
3588 data->longest = &(data->longest_float);
3589 }
3590 is_inf = is_inf_internal = 1;
3591 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3592 cl_anything(pRExC_state, data->start_class);
3593 flags &= ~SCF_DO_STCLASS;
3594 }
3595 }
a0ed51b3 3596 else if (OP(scan) == EVAL) {
c277df42
IZ
3597 if (data)
3598 data->flags |= SF_HAS_EVAL;
3599 }
e2e6a0f1 3600 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
40d049e4
YO
3601 if (flags & SCF_DO_SUBSTR) {
3602 scan_commit(pRExC_state,data,minlenp);
3603 flags &= ~SCF_DO_SUBSTR;
3604 }
e2e6a0f1
YO
3605 if (data && OP(scan)==ACCEPT) {
3606 data->flags |= SCF_SEEN_ACCEPT;
3607 if (stopmin > min)
3608 stopmin = min;
3609 }
40d049e4
YO
3610 }
3611 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3612 {
0f5d15d6 3613 if (flags & SCF_DO_SUBSTR) {
1de06328 3614 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3615 data->longest = &(data->longest_float);
3616 }
3617 is_inf = is_inf_internal = 1;
653099ff 3618 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3619 cl_anything(pRExC_state, data->start_class);
96776eda 3620 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3621 }
786e8c11 3622#ifdef TRIE_STUDY_OPT
40d049e4 3623#ifdef FULL_TRIE_STUDY
786e8c11
YO
3624 else if (PL_regkind[OP(scan)] == TRIE) {
3625 /* NOTE - There is similar code to this block above for handling
40d049e4 3626 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3627 check there too. */
7f69552c 3628 regnode *trie_node= scan;
786e8c11
YO
3629 regnode *tail= regnext(scan);
3630 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3631 I32 max1 = 0, min1 = I32_MAX;
3632 struct regnode_charclass_class accum;
3633
3634 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3635 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3636 if (flags & SCF_DO_STCLASS)
3637 cl_init_zero(pRExC_state, &accum);
3638
3639 if (!trie->jump) {
3640 min1= trie->minlen;
3641 max1= trie->maxlen;
3642 } else {
3643 const regnode *nextbranch= NULL;
3644 U32 word;
3645
3646 for ( word=1 ; word <= trie->wordcount ; word++)
3647 {
3648 I32 deltanext=0, minnext=0, f = 0, fake;
3649 struct regnode_charclass_class this_class;
3650
3651 data_fake.flags = 0;
3652 if (data) {
3653 data_fake.whilem_c = data->whilem_c;
3654 data_fake.last_closep = data->last_closep;
3655 }
3656 else
3657 data_fake.last_closep = &fake;
3658
3659 if (flags & SCF_DO_STCLASS) {
3660 cl_init(pRExC_state, &this_class);
3661 data_fake.start_class = &this_class;
3662 f = SCF_DO_STCLASS_AND;
3663 }
3664 if (flags & SCF_WHILEM_VISITED_POS)
3665 f |= SCF_WHILEM_VISITED_POS;
3666
3667 if (trie->jump[word]) {
3668 if (!nextbranch)
7f69552c
YO
3669 nextbranch = trie_node + trie->jump[0];
3670 scan= trie_node + trie->jump[word];
786e8c11
YO
3671 /* We go from the jump point to the branch that follows
3672 it. Note this means we need the vestigal unused branches
3673 even though they arent otherwise used.
3674 */
40d049e4
YO
3675 minnext = study_chunk(pRExC_state, &scan, minlenp,
3676 &deltanext, (regnode *)nextbranch, &data_fake,
3677 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3678 }
3679 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3680 nextbranch= regnext((regnode*)nextbranch);
3681
3682 if (min1 > (I32)(minnext + trie->minlen))
3683 min1 = minnext + trie->minlen;
3684 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3685 max1 = minnext + deltanext + trie->maxlen;
3686 if (deltanext == I32_MAX)
3687 is_inf = is_inf_internal = 1;
3688
3689 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3690 pars++;
e2e6a0f1
YO
3691 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3692 if ( stopmin > min + min1)
3693 stopmin = min + min1;
3694 flags &= ~SCF_DO_SUBSTR;
3695 if (data)
3