This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
assert() that SvPV_set() isn't used on arrays.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
40d049e4
YO
120 regnode **open_parens; /* pointers to open parens */
121 regnode **close_parens; /* pointers to close parens */
122 regnode *opend; /* END node in program */
1aa99e6b 123 I32 utf8;
6bda09f9 124 HV *charnames; /* cache of named sequences */
81714fb9 125 HV *paren_names; /* Paren names */
40d049e4
YO
126 regnode **recurse; /* Recurse regops */
127 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
128#if ADD_TO_REGEXEC
129 char *starttry; /* -Dr: where regtry was called. */
130#define RExC_starttry (pRExC_state->starttry)
131#endif
3dab1dad 132#ifdef DEBUGGING
be8e71aa 133 const char *lastparse;
3dab1dad
YO
134 I32 lastnum;
135#define RExC_lastparse (pRExC_state->lastparse)
136#define RExC_lastnum (pRExC_state->lastnum)
137#endif
830247a4
IZ
138} RExC_state_t;
139
e2509266 140#define RExC_flags (pRExC_state->flags)
830247a4
IZ
141#define RExC_precomp (pRExC_state->precomp)
142#define RExC_rx (pRExC_state->rx)
fac92740 143#define RExC_start (pRExC_state->start)
830247a4
IZ
144#define RExC_end (pRExC_state->end)
145#define RExC_parse (pRExC_state->parse)
146#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 147#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 148#define RExC_emit (pRExC_state->emit)
fac92740 149#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
150#define RExC_naughty (pRExC_state->naughty)
151#define RExC_sawback (pRExC_state->sawback)
152#define RExC_seen (pRExC_state->seen)
153#define RExC_size (pRExC_state->size)
154#define RExC_npar (pRExC_state->npar)
155#define RExC_extralen (pRExC_state->extralen)
156#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
157#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 158#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 159#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
160#define RExC_open_parens (pRExC_state->open_parens)
161#define RExC_close_parens (pRExC_state->close_parens)
162#define RExC_opend (pRExC_state->opend)
81714fb9 163#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
164#define RExC_recurse (pRExC_state->recurse)
165#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 166
a687059c
LW
167#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
168#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
169 ((*s) == '{' && regcurly(s)))
a687059c 170
35c8bce7
LW
171#ifdef SPSTART
172#undef SPSTART /* dratted cpp namespace... */
173#endif
a687059c
LW
174/*
175 * Flags to be passed up and down.
176 */
a687059c 177#define WORST 0 /* Worst case. */
821b33a5 178#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
179#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
180#define SPSTART 0x4 /* Starts with * or +. */
181#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 182
3dab1dad
YO
183#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
184
07be1b83
YO
185/* whether trie related optimizations are enabled */
186#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
187#define TRIE_STUDY_OPT
786e8c11 188#define FULL_TRIE_STUDY
07be1b83
YO
189#define TRIE_STCLASS
190#endif
1de06328
YO
191
192
40d049e4
YO
193
194#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
195#define PBITVAL(paren) (1 << ((paren) & 7))
196#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
197#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
198#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
199
200
1de06328
YO
201/* About scan_data_t.
202
203 During optimisation we recurse through the regexp program performing
204 various inplace (keyhole style) optimisations. In addition study_chunk
205 and scan_commit populate this data structure with information about
206 what strings MUST appear in the pattern. We look for the longest
207 string that must appear for at a fixed location, and we look for the
208 longest string that may appear at a floating location. So for instance
209 in the pattern:
210
211 /FOO[xX]A.*B[xX]BAR/
212
213 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
214 strings (because they follow a .* construct). study_chunk will identify
215 both FOO and BAR as being the longest fixed and floating strings respectively.
216
217 The strings can be composites, for instance
218
219 /(f)(o)(o)/
220
221 will result in a composite fixed substring 'foo'.
222
223 For each string some basic information is maintained:
224
225 - offset or min_offset
226 This is the position the string must appear at, or not before.
227 It also implicitly (when combined with minlenp) tells us how many
228 character must match before the string we are searching.
229 Likewise when combined with minlenp and the length of the string
230 tells us how many characters must appear after the string we have
231 found.
232
233 - max_offset
234 Only used for floating strings. This is the rightmost point that
235 the string can appear at. Ifset to I32 max it indicates that the
236 string can occur infinitely far to the right.
237
238 - minlenp
239 A pointer to the minimum length of the pattern that the string
240 was found inside. This is important as in the case of positive
241 lookahead or positive lookbehind we can have multiple patterns
242 involved. Consider
243
244 /(?=FOO).*F/
245
246 The minimum length of the pattern overall is 3, the minimum length
247 of the lookahead part is 3, but the minimum length of the part that
248 will actually match is 1. So 'FOO's minimum length is 3, but the
249 minimum length for the F is 1. This is important as the minimum length
250 is used to determine offsets in front of and behind the string being
251 looked for. Since strings can be composites this is the length of the
252 pattern at the time it was commited with a scan_commit. Note that
253 the length is calculated by study_chunk, so that the minimum lengths
254 are not known until the full pattern has been compiled, thus the
255 pointer to the value.
256
257 - lookbehind
258
259 In the case of lookbehind the string being searched for can be
260 offset past the start point of the final matching string.
261 If this value was just blithely removed from the min_offset it would
262 invalidate some of the calculations for how many chars must match
263 before or after (as they are derived from min_offset and minlen and
264 the length of the string being searched for).
265 When the final pattern is compiled and the data is moved from the
266 scan_data_t structure into the regexp structure the information
267 about lookbehind is factored in, with the information that would
268 have been lost precalculated in the end_shift field for the
269 associated string.
270
271 The fields pos_min and pos_delta are used to store the minimum offset
272 and the delta to the maximum offset at the current point in the pattern.
273
274*/
2c2d71f5
JH
275
276typedef struct scan_data_t {
1de06328
YO
277 /*I32 len_min; unused */
278 /*I32 len_delta; unused */
2c2d71f5
JH
279 I32 pos_min;
280 I32 pos_delta;
281 SV *last_found;
1de06328 282 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
283 I32 last_start_min;
284 I32 last_start_max;
1de06328
YO
285 SV **longest; /* Either &l_fixed, or &l_float. */
286 SV *longest_fixed; /* longest fixed string found in pattern */
287 I32 offset_fixed; /* offset where it starts */
288 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
289 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
290 SV *longest_float; /* longest floating string found in pattern */
291 I32 offset_float_min; /* earliest point in string it can appear */
292 I32 offset_float_max; /* latest point in string it can appear */
293 I32 *minlen_float; /* pointer to the minlen relevent to the string */
294 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
295 I32 flags;
296 I32 whilem_c;
cb434fcc 297 I32 *last_closep;
653099ff 298 struct regnode_charclass_class *start_class;
2c2d71f5
JH
299} scan_data_t;
300
a687059c 301/*
e50aee73 302 * Forward declarations for pregcomp()'s friends.
a687059c 303 */
a0d0e21e 304
27da23d5 305static const scan_data_t zero_scan_data =
1de06328 306 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
307
308#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
309#define SF_BEFORE_SEOL 0x0001
310#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
311#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
312#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
313
09b7f37c
CB
314#ifdef NO_UNARY_PLUS
315# define SF_FIX_SHIFT_EOL (0+2)
316# define SF_FL_SHIFT_EOL (0+4)
317#else
318# define SF_FIX_SHIFT_EOL (+2)
319# define SF_FL_SHIFT_EOL (+4)
320#endif
c277df42
IZ
321
322#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
323#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
324
325#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
326#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
327#define SF_IS_INF 0x0040
328#define SF_HAS_PAR 0x0080
329#define SF_IN_PAR 0x0100
330#define SF_HAS_EVAL 0x0200
331#define SCF_DO_SUBSTR 0x0400
653099ff
GS
332#define SCF_DO_STCLASS_AND 0x0800
333#define SCF_DO_STCLASS_OR 0x1000
334#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 335#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 336
786e8c11
YO
337#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
338
07be1b83 339
eb160463 340#define UTF (RExC_utf8 != 0)
e2509266
JH
341#define LOC ((RExC_flags & PMf_LOCALE) != 0)
342#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 343
ffc61ed2 344#define OOB_UNICODE 12345678
93733859 345#define OOB_NAMEDCLASS -1
b8c5462f 346
a0ed51b3
LW
347#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
348#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
349
8615cb43 350
b45f050a
JF
351/* length of regex to show in messages that don't mark a position within */
352#define RegexLengthToShowInErrorMessages 127
353
354/*
355 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
356 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
357 * op/pragma/warn/regcomp.
358 */
7253e4e3
RK
359#define MARKER1 "<-- HERE" /* marker as it appears in the description */
360#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 361
7253e4e3 362#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
363
364/*
365 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
366 * arg. Show regex, up to a maximum length. If it's too long, chop and add
367 * "...".
368 */
ccb2c380 369#define FAIL(msg) STMT_START { \
bfed75c6 370 const char *ellipses = ""; \
ccb2c380
MP
371 IV len = RExC_end - RExC_precomp; \
372 \
373 if (!SIZE_ONLY) \
374 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
375 if (len > RegexLengthToShowInErrorMessages) { \
376 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
377 len = RegexLengthToShowInErrorMessages - 10; \
378 ellipses = "..."; \
379 } \
380 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
381 msg, (int)len, RExC_precomp, ellipses); \
382} STMT_END
8615cb43 383
b45f050a 384/*
b45f050a
JF
385 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
386 */
ccb2c380 387#define Simple_vFAIL(m) STMT_START { \
a28509cc 388 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
389 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
b45f050a
JF
392
393/*
394 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
395 */
ccb2c380
MP
396#define vFAIL(m) STMT_START { \
397 if (!SIZE_ONLY) \
398 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
399 Simple_vFAIL(m); \
400} STMT_END
b45f050a
JF
401
402/*
403 * Like Simple_vFAIL(), but accepts two arguments.
404 */
ccb2c380 405#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 406 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
407 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
408 (int)offset, RExC_precomp, RExC_precomp + offset); \
409} STMT_END
b45f050a
JF
410
411/*
412 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
413 */
ccb2c380
MP
414#define vFAIL2(m,a1) STMT_START { \
415 if (!SIZE_ONLY) \
416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
417 Simple_vFAIL2(m, a1); \
418} STMT_END
b45f050a
JF
419
420
421/*
422 * Like Simple_vFAIL(), but accepts three arguments.
423 */
ccb2c380 424#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 425 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
426 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
427 (int)offset, RExC_precomp, RExC_precomp + offset); \
428} STMT_END
b45f050a
JF
429
430/*
431 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
432 */
ccb2c380
MP
433#define vFAIL3(m,a1,a2) STMT_START { \
434 if (!SIZE_ONLY) \
435 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
436 Simple_vFAIL3(m, a1, a2); \
437} STMT_END
b45f050a
JF
438
439/*
440 * Like Simple_vFAIL(), but accepts four arguments.
441 */
ccb2c380 442#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 443 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
444 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
445 (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
b45f050a 447
ccb2c380 448#define vWARN(loc,m) STMT_START { \
a28509cc 449 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
450 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
451 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
452} STMT_END
453
454#define vWARNdep(loc,m) STMT_START { \
a28509cc 455 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
456 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
457 "%s" REPORT_LOCATION, \
458 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
460
461
462#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 463 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
464 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
465 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
466} STMT_END
467
468#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 469 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
470 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
471 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 475 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
476 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
477 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
479
480#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
9d1d55b5 485
8615cb43 486
cd439c50 487/* Allow for side effects in s */
ccb2c380
MP
488#define REGC(c,s) STMT_START { \
489 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
490} STMT_END
cd439c50 491
fac92740
MJD
492/* Macros for recording node offsets. 20001227 mjd@plover.com
493 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
494 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
495 * Element 0 holds the number n.
07be1b83 496 * Position is 1 indexed.
fac92740
MJD
497 */
498
ccb2c380
MP
499#define Set_Node_Offset_To_R(node,byte) STMT_START { \
500 if (! SIZE_ONLY) { \
501 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 502 __LINE__, (node), (int)(byte))); \
ccb2c380 503 if((node) < 0) { \
551405c4 504 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
505 } else { \
506 RExC_offsets[2*(node)-1] = (byte); \
507 } \
508 } \
509} STMT_END
510
511#define Set_Node_Offset(node,byte) \
512 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
513#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
514
515#define Set_Node_Length_To_R(node,len) STMT_START { \
516 if (! SIZE_ONLY) { \
517 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 518 __LINE__, (int)(node), (int)(len))); \
ccb2c380 519 if((node) < 0) { \
551405c4 520 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
521 } else { \
522 RExC_offsets[2*(node)] = (len); \
523 } \
524 } \
525} STMT_END
526
527#define Set_Node_Length(node,len) \
528 Set_Node_Length_To_R((node)-RExC_emit_start, len)
529#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
530#define Set_Node_Cur_Length(node) \
531 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
532
533/* Get offsets and lengths */
534#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
535#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
536
07be1b83
YO
537#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
538 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
539 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
540} STMT_END
541
542
543#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
544#define EXPERIMENTAL_INPLACESCAN
545#endif
546
1de06328 547#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 548DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
549 PerlIO_printf(Perl_debug_log, \
550 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
551 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
552 (int)(depth)*2, "", \
553 (IV)((data)->pos_min), \
554 (IV)((data)->pos_delta), \
555 (IV)((data)->flags), \
556 (IV)((data)->whilem_c), \
557 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
558 ); \
559 if ((data)->last_found) \
560 PerlIO_printf(Perl_debug_log, \
561 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
562 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
563 SvPVX_const((data)->last_found), \
564 (IV)((data)->last_end), \
565 (IV)((data)->last_start_min), \
566 (IV)((data)->last_start_max), \
567 ((data)->longest && \
568 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
569 SvPVX_const((data)->longest_fixed), \
570 (IV)((data)->offset_fixed), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_float)) ? "*" : "", \
573 SvPVX_const((data)->longest_float), \
574 (IV)((data)->offset_float_min), \
575 (IV)((data)->offset_float_max) \
576 ); \
577 PerlIO_printf(Perl_debug_log,"\n"); \
578});
579
acfe0abc 580static void clear_re(pTHX_ void *r);
4327152a 581
653099ff 582/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 583 Update the longest found anchored substring and the longest found
653099ff
GS
584 floating substrings if needed. */
585
4327152a 586STATIC void
1de06328 587S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 588{
e1ec3a88
AL
589 const STRLEN l = CHR_SVLEN(data->last_found);
590 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 591 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 592
c277df42 593 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 594 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
595 if (*data->longest == data->longest_fixed) {
596 data->offset_fixed = l ? data->last_start_min : data->pos_min;
597 if (data->flags & SF_BEFORE_EOL)
b81d288d 598 data->flags
c277df42
IZ
599 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
600 else
601 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
602 data->minlen_fixed=minlenp;
603 data->lookbehind_fixed=0;
a0ed51b3
LW
604 }
605 else {
c277df42 606 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
607 data->offset_float_max = (l
608 ? data->last_start_max
c277df42 609 : data->pos_min + data->pos_delta);
9051bda5
HS
610 if ((U32)data->offset_float_max > (U32)I32_MAX)
611 data->offset_float_max = I32_MAX;
c277df42 612 if (data->flags & SF_BEFORE_EOL)
b81d288d 613 data->flags
c277df42
IZ
614 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
615 else
616 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
617 data->minlen_float=minlenp;
618 data->lookbehind_float=0;
c277df42
IZ
619 }
620 }
621 SvCUR_set(data->last_found, 0);
0eda9292 622 {
a28509cc 623 SV * const sv = data->last_found;
097eb12c
AL
624 if (SvUTF8(sv) && SvMAGICAL(sv)) {
625 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
626 if (mg)
627 mg->mg_len = 0;
628 }
0eda9292 629 }
c277df42
IZ
630 data->last_end = -1;
631 data->flags &= ~SF_BEFORE_EOL;
1de06328 632 DEBUG_STUDYDATA(data,0);
c277df42
IZ
633}
634
653099ff
GS
635/* Can match anything (initialization) */
636STATIC void
097eb12c 637S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 638{
653099ff 639 ANYOF_CLASS_ZERO(cl);
f8bef550 640 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 641 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
642 if (LOC)
643 cl->flags |= ANYOF_LOCALE;
644}
645
646/* Can match anything (initialization) */
647STATIC int
5f66b61c 648S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
649{
650 int value;
651
aaa51d5e 652 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
653 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
654 return 1;
1aa99e6b
IH
655 if (!(cl->flags & ANYOF_UNICODE_ALL))
656 return 0;
10edeb5d 657 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 658 return 0;
653099ff
GS
659 return 1;
660}
661
662/* Can match anything (initialization) */
663STATIC void
097eb12c 664S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 665{
8ecf7187 666 Zero(cl, 1, struct regnode_charclass_class);
653099ff 667 cl->type = ANYOF;
830247a4 668 cl_anything(pRExC_state, cl);
653099ff
GS
669}
670
671STATIC void
097eb12c 672S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 673{
8ecf7187 674 Zero(cl, 1, struct regnode_charclass_class);
653099ff 675 cl->type = ANYOF;
830247a4 676 cl_anything(pRExC_state, cl);
653099ff
GS
677 if (LOC)
678 cl->flags |= ANYOF_LOCALE;
679}
680
681/* 'And' a given class with another one. Can create false positives */
682/* We assume that cl is not inverted */
683STATIC void
5f66b61c 684S_cl_and(struct regnode_charclass_class *cl,
a28509cc 685 const struct regnode_charclass_class *and_with)
653099ff 686{
40d049e4
YO
687
688 assert(and_with->type == ANYOF);
653099ff
GS
689 if (!(and_with->flags & ANYOF_CLASS)
690 && !(cl->flags & ANYOF_CLASS)
691 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
692 && !(and_with->flags & ANYOF_FOLD)
693 && !(cl->flags & ANYOF_FOLD)) {
694 int i;
695
696 if (and_with->flags & ANYOF_INVERT)
697 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
698 cl->bitmap[i] &= ~and_with->bitmap[i];
699 else
700 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
701 cl->bitmap[i] &= and_with->bitmap[i];
702 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
703 if (!(and_with->flags & ANYOF_EOS))
704 cl->flags &= ~ANYOF_EOS;
1aa99e6b 705
14ebb1a2
JH
706 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
707 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
708 cl->flags &= ~ANYOF_UNICODE_ALL;
709 cl->flags |= ANYOF_UNICODE;
710 ARG_SET(cl, ARG(and_with));
711 }
14ebb1a2
JH
712 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
713 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 714 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
715 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
716 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 717 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
718}
719
720/* 'OR' a given class with another one. Can create false positives */
721/* We assume that cl is not inverted */
722STATIC void
097eb12c 723S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 724{
653099ff
GS
725 if (or_with->flags & ANYOF_INVERT) {
726 /* We do not use
727 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
728 * <= (B1 | !B2) | (CL1 | !CL2)
729 * which is wasteful if CL2 is small, but we ignore CL2:
730 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
731 * XXXX Can we handle case-fold? Unclear:
732 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
733 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
734 */
735 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
736 && !(or_with->flags & ANYOF_FOLD)
737 && !(cl->flags & ANYOF_FOLD) ) {
738 int i;
739
740 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
741 cl->bitmap[i] |= ~or_with->bitmap[i];
742 } /* XXXX: logic is complicated otherwise */
743 else {
830247a4 744 cl_anything(pRExC_state, cl);
653099ff
GS
745 }
746 } else {
747 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 749 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
750 || (cl->flags & ANYOF_FOLD)) ) {
751 int i;
752
753 /* OR char bitmap and class bitmap separately */
754 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
755 cl->bitmap[i] |= or_with->bitmap[i];
756 if (or_with->flags & ANYOF_CLASS) {
757 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
758 cl->classflags[i] |= or_with->classflags[i];
759 cl->flags |= ANYOF_CLASS;
760 }
761 }
762 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 763 cl_anything(pRExC_state, cl);
653099ff
GS
764 }
765 }
766 if (or_with->flags & ANYOF_EOS)
767 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
768
769 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
770 ARG(cl) != ARG(or_with)) {
771 cl->flags |= ANYOF_UNICODE_ALL;
772 cl->flags &= ~ANYOF_UNICODE;
773 }
774 if (or_with->flags & ANYOF_UNICODE_ALL) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
653099ff
GS
778}
779
a3621e74
YO
780#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
781#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
782#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
783#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
784
3dab1dad
YO
785
786#ifdef DEBUGGING
07be1b83 787/*
3dab1dad
YO
788 dump_trie(trie)
789 dump_trie_interim_list(trie,next_alloc)
790 dump_trie_interim_table(trie,next_alloc)
791
792 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
793 The _interim_ variants are used for debugging the interim
794 tables that are used to generate the final compressed
795 representation which is what dump_trie expects.
796
3dab1dad
YO
797 Part of the reason for their existance is to provide a form
798 of documentation as to how the different representations function.
07be1b83
YO
799
800*/
3dab1dad
YO
801
802/*
803 dump_trie(trie)
804 Dumps the final compressed table form of the trie to Perl_debug_log.
805 Used for debugging make_trie().
806*/
807
808STATIC void
809S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
810{
811 U32 state;
ab3bbdeb
YO
812 SV *sv=sv_newmortal();
813 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
814 GET_RE_DEBUG_FLAGS_DECL;
815
ab3bbdeb 816
3dab1dad
YO
817 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
818 (int)depth * 2 + 2,"",
819 "Match","Base","Ofs" );
820
821 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 822 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 823 if ( tmp ) {
ab3bbdeb
YO
824 PerlIO_printf( Perl_debug_log, "%*s",
825 colwidth,
ddc5bc0f 826 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
827 PL_colors[0], PL_colors[1],
828 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
829 PERL_PV_ESCAPE_FIRSTCHAR
830 )
831 );
3dab1dad
YO
832 }
833 }
834 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
835 (int)depth * 2 + 2,"");
836
837 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 838 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
839 PerlIO_printf( Perl_debug_log, "\n");
840
1e2e3d02 841 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 842 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
843
844 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
845
846 if ( trie->states[ state ].wordnum ) {
847 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
848 } else {
849 PerlIO_printf( Perl_debug_log, "%6s", "" );
850 }
851
852 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
853
854 if ( base ) {
855 U32 ofs = 0;
856
857 while( ( base + ofs < trie->uniquecharcount ) ||
858 ( base + ofs - trie->uniquecharcount < trie->lasttrans
859 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
860 ofs++;
861
862 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
863
864 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
865 if ( ( base + ofs >= trie->uniquecharcount ) &&
866 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
867 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
868 {
ab3bbdeb
YO
869 PerlIO_printf( Perl_debug_log, "%*"UVXf,
870 colwidth,
3dab1dad
YO
871 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
872 } else {
ab3bbdeb 873 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
874 }
875 }
876
877 PerlIO_printf( Perl_debug_log, "]");
878
879 }
880 PerlIO_printf( Perl_debug_log, "\n" );
881 }
882}
883/*
884 dump_trie_interim_list(trie,next_alloc)
885 Dumps a fully constructed but uncompressed trie in list form.
886 List tries normally only are used for construction when the number of
887 possible chars (trie->uniquecharcount) is very high.
888 Used for debugging make_trie().
889*/
890STATIC void
891S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
892{
893 U32 state;
ab3bbdeb
YO
894 SV *sv=sv_newmortal();
895 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
896 GET_RE_DEBUG_FLAGS_DECL;
897 /* print out the table precompression. */
ab3bbdeb
YO
898 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
899 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
900 "------:-----+-----------------\n" );
3dab1dad
YO
901
902 for( state=1 ; state < next_alloc ; state ++ ) {
903 U16 charid;
904
ab3bbdeb 905 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
906 (int)depth * 2 + 2,"", (UV)state );
907 if ( ! trie->states[ state ].wordnum ) {
908 PerlIO_printf( Perl_debug_log, "%5s| ","");
909 } else {
910 PerlIO_printf( Perl_debug_log, "W%4x| ",
911 trie->states[ state ].wordnum
912 );
913 }
914 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 915 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
916 if ( tmp ) {
917 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
918 colwidth,
ddc5bc0f 919 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
920 PL_colors[0], PL_colors[1],
921 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
922 PERL_PV_ESCAPE_FIRSTCHAR
923 ) ,
1e2e3d02
YO
924 TRIE_LIST_ITEM(state,charid).forid,
925 (UV)TRIE_LIST_ITEM(state,charid).newstate
926 );
927 if (!(charid % 10))
664e119d
RGS
928 PerlIO_printf(Perl_debug_log, "\n%*s| ",
929 (int)((depth * 2) + 14), "");
1e2e3d02 930 }
ab3bbdeb
YO
931 }
932 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
933 }
934}
935
936/*
937 dump_trie_interim_table(trie,next_alloc)
938 Dumps a fully constructed but uncompressed trie in table form.
939 This is the normal DFA style state transition table, with a few
940 twists to facilitate compression later.
941 Used for debugging make_trie().
942*/
943STATIC void
944S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
945{
946 U32 state;
947 U16 charid;
ab3bbdeb
YO
948 SV *sv=sv_newmortal();
949 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
950 GET_RE_DEBUG_FLAGS_DECL;
951
952 /*
953 print out the table precompression so that we can do a visual check
954 that they are identical.
955 */
956
957 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
958
959 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 960 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 961 if ( tmp ) {
ab3bbdeb
YO
962 PerlIO_printf( Perl_debug_log, "%*s",
963 colwidth,
ddc5bc0f 964 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
965 PL_colors[0], PL_colors[1],
966 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
967 PERL_PV_ESCAPE_FIRSTCHAR
968 )
969 );
3dab1dad
YO
970 }
971 }
972
973 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
974
975 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 976 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
977 }
978
979 PerlIO_printf( Perl_debug_log, "\n" );
980
981 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
982
983 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
984 (int)depth * 2 + 2,"",
985 (UV)TRIE_NODENUM( state ) );
986
987 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
988 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
989 if (v)
990 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
991 else
992 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
993 }
994 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
995 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
996 } else {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
998 trie->states[ TRIE_NODENUM( state ) ].wordnum );
999 }
1000 }
07be1b83 1001}
3dab1dad
YO
1002
1003#endif
1004
786e8c11
YO
1005/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1006 startbranch: the first branch in the whole branch sequence
1007 first : start branch of sequence of branch-exact nodes.
1008 May be the same as startbranch
1009 last : Thing following the last branch.
1010 May be the same as tail.
1011 tail : item following the branch sequence
1012 count : words in the sequence
1013 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1014 depth : indent depth
3dab1dad 1015
786e8c11 1016Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1017
786e8c11
YO
1018A trie is an N'ary tree where the branches are determined by digital
1019decomposition of the key. IE, at the root node you look up the 1st character and
1020follow that branch repeat until you find the end of the branches. Nodes can be
1021marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1022
786e8c11 1023 /he|she|his|hers/
72f13be8 1024
786e8c11
YO
1025would convert into the following structure. Numbers represent states, letters
1026following numbers represent valid transitions on the letter from that state, if
1027the number is in square brackets it represents an accepting state, otherwise it
1028will be in parenthesis.
07be1b83 1029
786e8c11
YO
1030 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1031 | |
1032 | (2)
1033 | |
1034 (1) +-i->(6)-+-s->[7]
1035 |
1036 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1037
786e8c11
YO
1038 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1039
1040This shows that when matching against the string 'hers' we will begin at state 1
1041read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1042then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1043is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1044single traverse. We store a mapping from accepting to state to which word was
1045matched, and then when we have multiple possibilities we try to complete the
1046rest of the regex in the order in which they occured in the alternation.
1047
1048The only prior NFA like behaviour that would be changed by the TRIE support is
1049the silent ignoring of duplicate alternations which are of the form:
1050
1051 / (DUPE|DUPE) X? (?{ ... }) Y /x
1052
1053Thus EVAL blocks follwing a trie may be called a different number of times with
1054and without the optimisation. With the optimisations dupes will be silently
1055ignored. This inconsistant behaviour of EVAL type nodes is well established as
1056the following demonstrates:
1057
1058 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1059
1060which prints out 'word' three times, but
1061
1062 'words'=~/(word|word|word)(?{ print $1 })S/
1063
1064which doesnt print it out at all. This is due to other optimisations kicking in.
1065
1066Example of what happens on a structural level:
1067
1068The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1069
1070 1: CURLYM[1] {1,32767}(18)
1071 5: BRANCH(8)
1072 6: EXACT <ac>(16)
1073 8: BRANCH(11)
1074 9: EXACT <ad>(16)
1075 11: BRANCH(14)
1076 12: EXACT <ab>(16)
1077 16: SUCCEED(0)
1078 17: NOTHING(18)
1079 18: END(0)
1080
1081This would be optimizable with startbranch=5, first=5, last=16, tail=16
1082and should turn into:
1083
1084 1: CURLYM[1] {1,32767}(18)
1085 5: TRIE(16)
1086 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1087 <ac>
1088 <ad>
1089 <ab>
1090 16: SUCCEED(0)
1091 17: NOTHING(18)
1092 18: END(0)
1093
1094Cases where tail != last would be like /(?foo|bar)baz/:
1095
1096 1: BRANCH(4)
1097 2: EXACT <foo>(8)
1098 4: BRANCH(7)
1099 5: EXACT <bar>(8)
1100 7: TAIL(8)
1101 8: EXACT <baz>(10)
1102 10: END(0)
1103
1104which would be optimizable with startbranch=1, first=1, last=7, tail=8
1105and would end up looking like:
1106
1107 1: TRIE(8)
1108 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1109 <foo>
1110 <bar>
1111 7: TAIL(8)
1112 8: EXACT <baz>(10)
1113 10: END(0)
1114
1115 d = uvuni_to_utf8_flags(d, uv, 0);
1116
1117is the recommended Unicode-aware way of saying
1118
1119 *(d++) = uv;
1120*/
1121
1e2e3d02 1122#define TRIE_STORE_REVCHAR \
786e8c11 1123 STMT_START { \
1e2e3d02 1124 SV *tmp = newSVpvs(""); \
786e8c11 1125 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1126 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1127 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1128 } STMT_END
1129
1130#define TRIE_READ_CHAR STMT_START { \
1131 wordlen++; \
1132 if ( UTF ) { \
1133 if ( folder ) { \
1134 if ( foldlen > 0 ) { \
1135 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1136 foldlen -= len; \
1137 scan += len; \
1138 len = 0; \
1139 } else { \
1140 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1141 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1142 foldlen -= UNISKIP( uvc ); \
1143 scan = foldbuf + UNISKIP( uvc ); \
1144 } \
1145 } else { \
1146 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1147 } \
1148 } else { \
1149 uvc = (U32)*uc; \
1150 len = 1; \
1151 } \
1152} STMT_END
1153
1154
1155
1156#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1157 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1158 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1159 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1160 } \
1161 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1162 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1163 TRIE_LIST_CUR( state )++; \
1164} STMT_END
07be1b83 1165
786e8c11
YO
1166#define TRIE_LIST_NEW(state) STMT_START { \
1167 Newxz( trie->states[ state ].trans.list, \
1168 4, reg_trie_trans_le ); \
1169 TRIE_LIST_CUR( state ) = 1; \
1170 TRIE_LIST_LEN( state ) = 4; \
1171} STMT_END
07be1b83 1172
786e8c11
YO
1173#define TRIE_HANDLE_WORD(state) STMT_START { \
1174 U16 dupe= trie->states[ state ].wordnum; \
1175 regnode * const noper_next = regnext( noper ); \
1176 \
1177 if (trie->wordlen) \
1178 trie->wordlen[ curword ] = wordlen; \
1179 DEBUG_r({ \
1180 /* store the word for dumping */ \
1181 SV* tmp; \
1182 if (OP(noper) != NOTHING) \
1183 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1184 else \
1185 tmp = newSVpvn( "", 0 ); \
1186 if ( UTF ) SvUTF8_on( tmp ); \
1187 av_push( trie->words, tmp ); \
1188 }); \
1189 \
1190 curword++; \
1191 \
1192 if ( noper_next < tail ) { \
1193 if (!trie->jump) \
1194 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1195 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1196 if (!jumper) \
1197 jumper = noper_next; \
1198 if (!nextbranch) \
1199 nextbranch= regnext(cur); \
1200 } \
1201 \
1202 if ( dupe ) { \
1203 /* So it's a dupe. This means we need to maintain a */\
1204 /* linked-list from the first to the next. */\
1205 /* we only allocate the nextword buffer when there */\
1206 /* a dupe, so first time we have to do the allocation */\
1207 if (!trie->nextword) \
1208 Newxz( trie->nextword, word_count + 1, U16); \
1209 while ( trie->nextword[dupe] ) \
1210 dupe= trie->nextword[dupe]; \
1211 trie->nextword[dupe]= curword; \
1212 } else { \
1213 /* we haven't inserted this word yet. */ \
1214 trie->states[ state ].wordnum = curword; \
1215 } \
1216} STMT_END
07be1b83 1217
3dab1dad 1218
786e8c11
YO
1219#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1220 ( ( base + charid >= ucharcount \
1221 && base + charid < ubound \
1222 && state == trie->trans[ base - ucharcount + charid ].check \
1223 && trie->trans[ base - ucharcount + charid ].next ) \
1224 ? trie->trans[ base - ucharcount + charid ].next \
1225 : ( state==1 ? special : 0 ) \
1226 )
3dab1dad 1227
786e8c11
YO
1228#define MADE_TRIE 1
1229#define MADE_JUMP_TRIE 2
1230#define MADE_EXACT_TRIE 4
3dab1dad 1231
a3621e74 1232STATIC I32
786e8c11 1233S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1234{
27da23d5 1235 dVAR;
a3621e74
YO
1236 /* first pass, loop through and scan words */
1237 reg_trie_data *trie;
1238 regnode *cur;
9f7f3913 1239 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1240 STRLEN len = 0;
1241 UV uvc = 0;
1242 U16 curword = 0;
1243 U32 next_alloc = 0;
786e8c11
YO
1244 regnode *jumper = NULL;
1245 regnode *nextbranch = NULL;
7f69552c 1246 regnode *convert = NULL;
a3621e74 1247 /* we just use folder as a flag in utf8 */
e1ec3a88 1248 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1249 ? PL_fold
1250 : ( flags == EXACTFL
1251 ? PL_fold_locale
1252 : NULL
1253 )
1254 );
1255
e1ec3a88 1256 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1257 SV *re_trie_maxbuff;
3dab1dad
YO
1258#ifndef DEBUGGING
1259 /* these are only used during construction but are useful during
8e11feef 1260 * debugging so we store them in the struct when debugging.
8e11feef 1261 */
3dab1dad 1262 STRLEN trie_charcount=0;
3dab1dad
YO
1263 AV *trie_revcharmap;
1264#endif
a3621e74 1265 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1266#ifndef DEBUGGING
1267 PERL_UNUSED_ARG(depth);
1268#endif
a3621e74 1269
a02a5408 1270 Newxz( trie, 1, reg_trie_data );
a3621e74 1271 trie->refcount = 1;
3dab1dad 1272 trie->startstate = 1;
786e8c11 1273 trie->wordcount = word_count;
a3621e74 1274 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1275 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1276 if (!(UTF && folder))
1277 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1278 DEBUG_r({
1279 trie->words = newAV();
a3621e74 1280 });
3dab1dad 1281 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1282
0111c4fd 1283 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1284 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1285 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1286 }
3dab1dad
YO
1287 DEBUG_OPTIMISE_r({
1288 PerlIO_printf( Perl_debug_log,
786e8c11 1289 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1290 (int)depth * 2 + 2, "",
1291 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1292 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1293 (int)depth);
3dab1dad 1294 });
7f69552c
YO
1295
1296 /* Find the node we are going to overwrite */
1297 if ( first == startbranch && OP( last ) != BRANCH ) {
1298 /* whole branch chain */
1299 convert = first;
1300 } else {
1301 /* branch sub-chain */
1302 convert = NEXTOPER( first );
1303 }
1304
a3621e74
YO
1305 /* -- First loop and Setup --
1306
1307 We first traverse the branches and scan each word to determine if it
1308 contains widechars, and how many unique chars there are, this is
1309 important as we have to build a table with at least as many columns as we
1310 have unique chars.
1311
1312 We use an array of integers to represent the character codes 0..255
1313 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1314 native representation of the character value as the key and IV's for the
1315 coded index.
1316
1317 *TODO* If we keep track of how many times each character is used we can
1318 remap the columns so that the table compression later on is more
1319 efficient in terms of memory by ensuring most common value is in the
1320 middle and the least common are on the outside. IMO this would be better
1321 than a most to least common mapping as theres a decent chance the most
1322 common letter will share a node with the least common, meaning the node
1323 will not be compressable. With a middle is most common approach the worst
1324 case is when we have the least common nodes twice.
1325
1326 */
1327
a3621e74 1328 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1329 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1330 const U8 *uc = (U8*)STRING( noper );
a28509cc 1331 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1332 STRLEN foldlen = 0;
1333 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1334 const U8 *scan = (U8*)NULL;
07be1b83 1335 U32 wordlen = 0; /* required init */
3dab1dad 1336 STRLEN chars=0;
a3621e74 1337
3dab1dad
YO
1338 if (OP(noper) == NOTHING) {
1339 trie->minlen= 0;
1340 continue;
1341 }
1342 if (trie->bitmap) {
1343 TRIE_BITMAP_SET(trie,*uc);
1344 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1345 }
a3621e74 1346 for ( ; uc < e ; uc += len ) {
3dab1dad 1347 TRIE_CHARCOUNT(trie)++;
a3621e74 1348 TRIE_READ_CHAR;
3dab1dad 1349 chars++;
a3621e74
YO
1350 if ( uvc < 256 ) {
1351 if ( !trie->charmap[ uvc ] ) {
1352 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1353 if ( folder )
1354 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1355 TRIE_STORE_REVCHAR;
a3621e74
YO
1356 }
1357 } else {
1358 SV** svpp;
1359 if ( !trie->widecharmap )
1360 trie->widecharmap = newHV();
1361
1362 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1363
1364 if ( !svpp )
e4584336 1365 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1366
1367 if ( !SvTRUE( *svpp ) ) {
1368 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1369 TRIE_STORE_REVCHAR;
a3621e74
YO
1370 }
1371 }
1372 }
3dab1dad
YO
1373 if( cur == first ) {
1374 trie->minlen=chars;
1375 trie->maxlen=chars;
1376 } else if (chars < trie->minlen) {
1377 trie->minlen=chars;
1378 } else if (chars > trie->maxlen) {
1379 trie->maxlen=chars;
1380 }
1381
a3621e74
YO
1382 } /* end first pass */
1383 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1384 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1385 (int)depth * 2 + 2,"",
85c3142d 1386 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1387 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1388 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1389 );
786e8c11 1390 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1391
1392 /*
1393 We now know what we are dealing with in terms of unique chars and
1394 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1395 representation using a flat table will take. If it's over a reasonable
1396 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1397 conservative but potentially much slower representation using an array
1398 of lists.
1399
1400 At the end we convert both representations into the same compressed
1401 form that will be used in regexec.c for matching with. The latter
1402 is a form that cannot be used to construct with but has memory
1403 properties similar to the list form and access properties similar
1404 to the table form making it both suitable for fast searches and
1405 small enough that its feasable to store for the duration of a program.
1406
1407 See the comment in the code where the compressed table is produced
1408 inplace from the flat tabe representation for an explanation of how
1409 the compression works.
1410
1411 */
1412
1413
3dab1dad 1414 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1415 /*
1416 Second Pass -- Array Of Lists Representation
1417
1418 Each state will be represented by a list of charid:state records
1419 (reg_trie_trans_le) the first such element holds the CUR and LEN
1420 points of the allocated array. (See defines above).
1421
1422 We build the initial structure using the lists, and then convert
1423 it into the compressed table form which allows faster lookups
1424 (but cant be modified once converted).
a3621e74
YO
1425 */
1426
a3621e74
YO
1427 STRLEN transcount = 1;
1428
1e2e3d02
YO
1429 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1430 "%*sCompiling trie using list compiler\n",
1431 (int)depth * 2 + 2, ""));
1432
3dab1dad 1433 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1434 TRIE_LIST_NEW(1);
1435 next_alloc = 2;
1436
1437 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1438
c445ea15
AL
1439 regnode * const noper = NEXTOPER( cur );
1440 U8 *uc = (U8*)STRING( noper );
1441 const U8 * const e = uc + STR_LEN( noper );
1442 U32 state = 1; /* required init */
1443 U16 charid = 0; /* sanity init */
1444 U8 *scan = (U8*)NULL; /* sanity init */
1445 STRLEN foldlen = 0; /* required init */
07be1b83 1446 U32 wordlen = 0; /* required init */
c445ea15
AL
1447 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1448
3dab1dad 1449 if (OP(noper) != NOTHING) {
786e8c11 1450 for ( ; uc < e ; uc += len ) {
c445ea15 1451
786e8c11 1452 TRIE_READ_CHAR;
c445ea15 1453
786e8c11
YO
1454 if ( uvc < 256 ) {
1455 charid = trie->charmap[ uvc ];
c445ea15 1456 } else {
786e8c11
YO
1457 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1458 if ( !svpp ) {
1459 charid = 0;
1460 } else {
1461 charid=(U16)SvIV( *svpp );
1462 }
c445ea15 1463 }
786e8c11
YO
1464 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1465 if ( charid ) {
a3621e74 1466
786e8c11
YO
1467 U16 check;
1468 U32 newstate = 0;
a3621e74 1469
786e8c11
YO
1470 charid--;
1471 if ( !trie->states[ state ].trans.list ) {
1472 TRIE_LIST_NEW( state );
c445ea15 1473 }
786e8c11
YO
1474 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1475 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1476 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1477 break;
1478 }
1479 }
1480 if ( ! newstate ) {
1481 newstate = next_alloc++;
1482 TRIE_LIST_PUSH( state, charid, newstate );
1483 transcount++;
1484 }
1485 state = newstate;
1486 } else {
1487 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1488 }
a28509cc 1489 }
c445ea15 1490 }
3dab1dad 1491 TRIE_HANDLE_WORD(state);
a3621e74
YO
1492
1493 } /* end second pass */
1494
1e2e3d02
YO
1495 /* next alloc is the NEXT state to be allocated */
1496 trie->statecount = next_alloc;
a3621e74
YO
1497 Renew( trie->states, next_alloc, reg_trie_state );
1498
3dab1dad
YO
1499 /* and now dump it out before we compress it */
1500 DEBUG_TRIE_COMPILE_MORE_r(
1501 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1502 );
a3621e74 1503
a02a5408 1504 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1505 {
1506 U32 state;
a3621e74
YO
1507 U32 tp = 0;
1508 U32 zp = 0;
1509
1510
1511 for( state=1 ; state < next_alloc ; state ++ ) {
1512 U32 base=0;
1513
1514 /*
1515 DEBUG_TRIE_COMPILE_MORE_r(
1516 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1517 );
1518 */
1519
1520 if (trie->states[state].trans.list) {
1521 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1522 U16 maxid=minid;
a28509cc 1523 U16 idx;
a3621e74
YO
1524
1525 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1526 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1527 if ( forid < minid ) {
1528 minid=forid;
1529 } else if ( forid > maxid ) {
1530 maxid=forid;
1531 }
a3621e74
YO
1532 }
1533 if ( transcount < tp + maxid - minid + 1) {
1534 transcount *= 2;
1535 Renew( trie->trans, transcount, reg_trie_trans );
1536 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1537 }
1538 base = trie->uniquecharcount + tp - minid;
1539 if ( maxid == minid ) {
1540 U32 set = 0;
1541 for ( ; zp < tp ; zp++ ) {
1542 if ( ! trie->trans[ zp ].next ) {
1543 base = trie->uniquecharcount + zp - minid;
1544 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1545 trie->trans[ zp ].check = state;
1546 set = 1;
1547 break;
1548 }
1549 }
1550 if ( !set ) {
1551 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1552 trie->trans[ tp ].check = state;
1553 tp++;
1554 zp = tp;
1555 }
1556 } else {
1557 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1558 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1559 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1560 trie->trans[ tid ].check = state;
1561 }
1562 tp += ( maxid - minid + 1 );
1563 }
1564 Safefree(trie->states[ state ].trans.list);
1565 }
1566 /*
1567 DEBUG_TRIE_COMPILE_MORE_r(
1568 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1569 );
1570 */
1571 trie->states[ state ].trans.base=base;
1572 }
cc601c31 1573 trie->lasttrans = tp + 1;
a3621e74
YO
1574 }
1575 } else {
1576 /*
1577 Second Pass -- Flat Table Representation.
1578
1579 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1580 We know that we will need Charcount+1 trans at most to store the data
1581 (one row per char at worst case) So we preallocate both structures
1582 assuming worst case.
1583
1584 We then construct the trie using only the .next slots of the entry
1585 structs.
1586
1587 We use the .check field of the first entry of the node temporarily to
1588 make compression both faster and easier by keeping track of how many non
1589 zero fields are in the node.
1590
1591 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1592 transition.
1593
1594 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1595 number representing the first entry of the node, and state as a
1596 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1597 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1598 are 2 entrys per node. eg:
1599
1600 A B A B
1601 1. 2 4 1. 3 7
1602 2. 0 3 3. 0 5
1603 3. 0 0 5. 0 0
1604 4. 0 0 7. 0 0
1605
1606 The table is internally in the right hand, idx form. However as we also
1607 have to deal with the states array which is indexed by nodenum we have to
1608 use TRIE_NODENUM() to convert.
1609
1610 */
1e2e3d02
YO
1611 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1612 "%*sCompiling trie using table compiler\n",
1613 (int)depth * 2 + 2, ""));
3dab1dad
YO
1614
1615 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1616 reg_trie_trans );
3dab1dad 1617 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1618 next_alloc = trie->uniquecharcount + 1;
1619
3dab1dad 1620
a3621e74
YO
1621 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1622
c445ea15 1623 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1624 const U8 *uc = (U8*)STRING( noper );
1625 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1626
1627 U32 state = 1; /* required init */
1628
1629 U16 charid = 0; /* sanity init */
1630 U32 accept_state = 0; /* sanity init */
1631 U8 *scan = (U8*)NULL; /* sanity init */
1632
1633 STRLEN foldlen = 0; /* required init */
07be1b83 1634 U32 wordlen = 0; /* required init */
a3621e74
YO
1635 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1636
3dab1dad 1637 if ( OP(noper) != NOTHING ) {
786e8c11 1638 for ( ; uc < e ; uc += len ) {
a3621e74 1639
786e8c11 1640 TRIE_READ_CHAR;
a3621e74 1641
786e8c11
YO
1642 if ( uvc < 256 ) {
1643 charid = trie->charmap[ uvc ];
1644 } else {
1645 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1646 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1647 }
786e8c11
YO
1648 if ( charid ) {
1649 charid--;
1650 if ( !trie->trans[ state + charid ].next ) {
1651 trie->trans[ state + charid ].next = next_alloc;
1652 trie->trans[ state ].check++;
1653 next_alloc += trie->uniquecharcount;
1654 }
1655 state = trie->trans[ state + charid ].next;
1656 } else {
1657 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1658 }
1659 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1660 }
a3621e74 1661 }
3dab1dad
YO
1662 accept_state = TRIE_NODENUM( state );
1663 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1664
1665 } /* end second pass */
1666
3dab1dad
YO
1667 /* and now dump it out before we compress it */
1668 DEBUG_TRIE_COMPILE_MORE_r(
1669 dump_trie_interim_table(trie,next_alloc,depth+1)
1670 );
a3621e74 1671
a3621e74
YO
1672 {
1673 /*
1674 * Inplace compress the table.*
1675
1676 For sparse data sets the table constructed by the trie algorithm will
1677 be mostly 0/FAIL transitions or to put it another way mostly empty.
1678 (Note that leaf nodes will not contain any transitions.)
1679
1680 This algorithm compresses the tables by eliminating most such
1681 transitions, at the cost of a modest bit of extra work during lookup:
1682
1683 - Each states[] entry contains a .base field which indicates the
1684 index in the state[] array wheres its transition data is stored.
1685
1686 - If .base is 0 there are no valid transitions from that node.
1687
1688 - If .base is nonzero then charid is added to it to find an entry in
1689 the trans array.
1690
1691 -If trans[states[state].base+charid].check!=state then the
1692 transition is taken to be a 0/Fail transition. Thus if there are fail
1693 transitions at the front of the node then the .base offset will point
1694 somewhere inside the previous nodes data (or maybe even into a node
1695 even earlier), but the .check field determines if the transition is
1696 valid.
1697
786e8c11 1698 XXX - wrong maybe?
a3621e74
YO
1699 The following process inplace converts the table to the compressed
1700 table: We first do not compress the root node 1,and mark its all its
1701 .check pointers as 1 and set its .base pointer as 1 as well. This
1702 allows to do a DFA construction from the compressed table later, and
1703 ensures that any .base pointers we calculate later are greater than
1704 0.
1705
1706 - We set 'pos' to indicate the first entry of the second node.
1707
1708 - We then iterate over the columns of the node, finding the first and
1709 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1710 and set the .check pointers accordingly, and advance pos
1711 appropriately and repreat for the next node. Note that when we copy
1712 the next pointers we have to convert them from the original
1713 NODEIDX form to NODENUM form as the former is not valid post
1714 compression.
1715
1716 - If a node has no transitions used we mark its base as 0 and do not
1717 advance the pos pointer.
1718
1719 - If a node only has one transition we use a second pointer into the
1720 structure to fill in allocated fail transitions from other states.
1721 This pointer is independent of the main pointer and scans forward
1722 looking for null transitions that are allocated to a state. When it
1723 finds one it writes the single transition into the "hole". If the
786e8c11 1724 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1725
1726 - Once compressed we can Renew/realloc the structures to release the
1727 excess space.
1728
1729 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1730 specifically Fig 3.47 and the associated pseudocode.
1731
1732 demq
1733 */
a3b680e6 1734 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1735 U32 state, charid;
a3621e74 1736 U32 pos = 0, zp=0;
1e2e3d02 1737 trie->statecount = laststate;
a3621e74
YO
1738
1739 for ( state = 1 ; state < laststate ; state++ ) {
1740 U8 flag = 0;
a28509cc
AL
1741 const U32 stateidx = TRIE_NODEIDX( state );
1742 const U32 o_used = trie->trans[ stateidx ].check;
1743 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1744 trie->trans[ stateidx ].check = 0;
1745
1746 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1747 if ( flag || trie->trans[ stateidx + charid ].next ) {
1748 if ( trie->trans[ stateidx + charid ].next ) {
1749 if (o_used == 1) {
1750 for ( ; zp < pos ; zp++ ) {
1751 if ( ! trie->trans[ zp ].next ) {
1752 break;
1753 }
1754 }
1755 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1756 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1757 trie->trans[ zp ].check = state;
1758 if ( ++zp > pos ) pos = zp;
1759 break;
1760 }
1761 used--;
1762 }
1763 if ( !flag ) {
1764 flag = 1;
1765 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1766 }
1767 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1768 trie->trans[ pos ].check = state;
1769 pos++;
1770 }
1771 }
1772 }
cc601c31 1773 trie->lasttrans = pos + 1;
1e2e3d02 1774 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1775 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1776 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1777 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1778 (int)depth * 2 + 2,"",
1779 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1780 (IV)next_alloc,
1781 (IV)pos,
a3621e74
YO
1782 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1783 );
1784
1785 } /* end table compress */
1786 }
1e2e3d02
YO
1787 DEBUG_TRIE_COMPILE_MORE_r(
1788 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1789 (int)depth * 2 + 2, "",
1790 (UV)trie->statecount,
1791 (UV)trie->lasttrans)
1792 );
cc601c31
YO
1793 /* resize the trans array to remove unused space */
1794 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1795
3dab1dad
YO
1796 /* and now dump out the compressed format */
1797 DEBUG_TRIE_COMPILE_r(
1798 dump_trie(trie,depth+1)
1799 );
07be1b83 1800
3dab1dad 1801 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1802 U8 nodetype =(U8)(flags & 0xFF);
1803 char *str=NULL;
786e8c11 1804
07be1b83 1805#ifdef DEBUGGING
e62cc96a 1806 regnode *optimize = NULL;
b57a0404
JH
1807 U32 mjd_offset = 0;
1808 U32 mjd_nodelen = 0;
07be1b83 1809#endif
a3621e74 1810 /*
3dab1dad
YO
1811 This means we convert either the first branch or the first Exact,
1812 depending on whether the thing following (in 'last') is a branch
1813 or not and whther first is the startbranch (ie is it a sub part of
1814 the alternation or is it the whole thing.)
1815 Assuming its a sub part we conver the EXACT otherwise we convert
1816 the whole branch sequence, including the first.
a3621e74 1817 */
3dab1dad 1818 /* Find the node we are going to overwrite */
7f69552c 1819 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1820 /* branch sub-chain */
3dab1dad 1821 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1822 DEBUG_r({
1823 mjd_offset= Node_Offset((convert));
1824 mjd_nodelen= Node_Length((convert));
1825 });
7f69552c
YO
1826 /* whole branch chain */
1827 } else {
1828 DEBUG_r({
1829 const regnode *nop = NEXTOPER( convert );
1830 mjd_offset= Node_Offset((nop));
1831 mjd_nodelen= Node_Length((nop));
1832 });
07be1b83 1833 }
7f69552c 1834
07be1b83
YO
1835 DEBUG_OPTIMISE_r(
1836 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1837 (int)depth * 2 + 2, "",
786e8c11 1838 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1839 );
a3621e74 1840
3dab1dad
YO
1841 /* But first we check to see if there is a common prefix we can
1842 split out as an EXACT and put in front of the TRIE node. */
1843 trie->startstate= 1;
786e8c11 1844 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1845 U32 state;
1e2e3d02 1846 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1847 U32 ofs = 0;
8e11feef
RGS
1848 I32 idx = -1;
1849 U32 count = 0;
1850 const U32 base = trie->states[ state ].trans.base;
a3621e74 1851
3dab1dad 1852 if ( trie->states[state].wordnum )
8e11feef 1853 count = 1;
a3621e74 1854
8e11feef 1855 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1856 if ( ( base + ofs >= trie->uniquecharcount ) &&
1857 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1858 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1859 {
3dab1dad 1860 if ( ++count > 1 ) {
8e11feef 1861 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1862 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1863 if ( state == 1 ) break;
3dab1dad
YO
1864 if ( count == 2 ) {
1865 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1866 DEBUG_OPTIMISE_r(
8e11feef
RGS
1867 PerlIO_printf(Perl_debug_log,
1868 "%*sNew Start State=%"UVuf" Class: [",
1869 (int)depth * 2 + 2, "",
786e8c11 1870 (UV)state));
be8e71aa
YO
1871 if (idx >= 0) {
1872 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1873 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1874
3dab1dad 1875 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1876 if ( folder )
1877 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1878 DEBUG_OPTIMISE_r(
07be1b83 1879 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1880 );
8e11feef
RGS
1881 }
1882 }
1883 TRIE_BITMAP_SET(trie,*ch);
1884 if ( folder )
1885 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1886 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1887 }
1888 idx = ofs;
1889 }
3dab1dad
YO
1890 }
1891 if ( count == 1 ) {
1892 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1893 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1894 DEBUG_OPTIMISE_r(
8e11feef
RGS
1895 PerlIO_printf( Perl_debug_log,
1896 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1897 (int)depth * 2 + 2, "",
786e8c11 1898 (UV)state, (UV)idx, ch)
3dab1dad
YO
1899 );
1900 if ( state==1 ) {
1901 OP( convert ) = nodetype;
1902 str=STRING(convert);
1903 STR_LEN(convert)=0;
1904 }
1905 *str++=*ch;
1906 STR_LEN(convert)++;
a3621e74 1907
8e11feef 1908 } else {
f9049ba1 1909#ifdef DEBUGGING
8e11feef
RGS
1910 if (state>1)
1911 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1912#endif
8e11feef
RGS
1913 break;
1914 }
1915 }
3dab1dad 1916 if (str) {
8e11feef 1917 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1918 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1919 trie->startstate = state;
07be1b83
YO
1920 trie->minlen -= (state - 1);
1921 trie->maxlen -= (state - 1);
1922 DEBUG_r({
1923 regnode *fix = convert;
1924 mjd_nodelen++;
1925 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1926 while( ++fix < n ) {
1927 Set_Node_Offset_Length(fix, 0, 0);
1928 }
1929 });
8e11feef
RGS
1930 if (trie->maxlen) {
1931 convert = n;
1932 } else {
3dab1dad 1933 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1934 DEBUG_r(optimize= n);
3dab1dad
YO
1935 }
1936 }
1937 }
a5ca303d
YO
1938 if (!jumper)
1939 jumper = last;
3dab1dad 1940 if ( trie->maxlen ) {
8e11feef
RGS
1941 NEXT_OFF( convert ) = (U16)(tail - convert);
1942 ARG_SET( convert, data_slot );
786e8c11
YO
1943 /* Store the offset to the first unabsorbed branch in
1944 jump[0], which is otherwise unused by the jump logic.
1945 We use this when dumping a trie and during optimisation. */
1946 if (trie->jump)
7f69552c 1947 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1948
786e8c11
YO
1949 /* XXXX */
1950 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1951 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1952 {
1953 OP( convert ) = TRIEC;
1954 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1955 Safefree(trie->bitmap);
1956 trie->bitmap= NULL;
1957 } else
1958 OP( convert ) = TRIE;
a3621e74 1959
3dab1dad
YO
1960 /* store the type in the flags */
1961 convert->flags = nodetype;
a5ca303d
YO
1962 DEBUG_r({
1963 optimize = convert
1964 + NODE_STEP_REGNODE
1965 + regarglen[ OP( convert ) ];
1966 });
1967 /* XXX We really should free up the resource in trie now,
1968 as we won't use them - (which resources?) dmq */
3dab1dad 1969 }
a3621e74 1970 /* needed for dumping*/
e62cc96a 1971 DEBUG_r(if (optimize) {
07be1b83 1972 regnode *opt = convert;
e62cc96a 1973 while ( ++opt < optimize) {
07be1b83
YO
1974 Set_Node_Offset_Length(opt,0,0);
1975 }
786e8c11
YO
1976 /*
1977 Try to clean up some of the debris left after the
1978 optimisation.
a3621e74 1979 */
786e8c11 1980 while( optimize < jumper ) {
07be1b83 1981 mjd_nodelen += Node_Length((optimize));
a3621e74 1982 OP( optimize ) = OPTIMIZED;
07be1b83 1983 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1984 optimize++;
1985 }
07be1b83 1986 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1987 });
1988 } /* end node insert */
07be1b83 1989#ifndef DEBUGGING
6e8b4190 1990 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1991#endif
786e8c11
YO
1992 return trie->jump
1993 ? MADE_JUMP_TRIE
1994 : trie->startstate>1
1995 ? MADE_EXACT_TRIE
1996 : MADE_TRIE;
1997}
1998
1999STATIC void
2000S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2001{
2002/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2003
2004 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2005 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2006 ISBN 0-201-10088-6
2007
2008 We find the fail state for each state in the trie, this state is the longest proper
2009 suffix of the current states 'word' that is also a proper prefix of another word in our
2010 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2011 the DFA not to have to restart after its tried and failed a word at a given point, it
2012 simply continues as though it had been matching the other word in the first place.
2013 Consider
2014 'abcdgu'=~/abcdefg|cdgu/
2015 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2016 fail, which would bring use to the state representing 'd' in the second word where we would
2017 try 'g' and succeed, prodceding to match 'cdgu'.
2018 */
2019 /* add a fail transition */
2020 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2021 U32 *q;
2022 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2023 const U32 numstates = trie->statecount;
786e8c11
YO
2024 const U32 ubound = trie->lasttrans + ucharcount;
2025 U32 q_read = 0;
2026 U32 q_write = 0;
2027 U32 charid;
2028 U32 base = trie->states[ 1 ].trans.base;
2029 U32 *fail;
2030 reg_ac_data *aho;
2031 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2032 GET_RE_DEBUG_FLAGS_DECL;
2033#ifndef DEBUGGING
2034 PERL_UNUSED_ARG(depth);
2035#endif
2036
2037
2038 ARG_SET( stclass, data_slot );
2039 Newxz( aho, 1, reg_ac_data );
2040 RExC_rx->data->data[ data_slot ] = (void*)aho;
2041 aho->trie=trie;
2042 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2043 numstates * sizeof(reg_trie_state));
786e8c11
YO
2044 Newxz( q, numstates, U32);
2045 Newxz( aho->fail, numstates, U32 );
2046 aho->refcount = 1;
2047 fail = aho->fail;
2048 /* initialize fail[0..1] to be 1 so that we always have
2049 a valid final fail state */
2050 fail[ 0 ] = fail[ 1 ] = 1;
2051
2052 for ( charid = 0; charid < ucharcount ; charid++ ) {
2053 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2054 if ( newstate ) {
2055 q[ q_write ] = newstate;
2056 /* set to point at the root */
2057 fail[ q[ q_write++ ] ]=1;
2058 }
2059 }
2060 while ( q_read < q_write) {
2061 const U32 cur = q[ q_read++ % numstates ];
2062 base = trie->states[ cur ].trans.base;
2063
2064 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2065 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2066 if (ch_state) {
2067 U32 fail_state = cur;
2068 U32 fail_base;
2069 do {
2070 fail_state = fail[ fail_state ];
2071 fail_base = aho->states[ fail_state ].trans.base;
2072 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2073
2074 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2075 fail[ ch_state ] = fail_state;
2076 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2077 {
2078 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2079 }
2080 q[ q_write++ % numstates] = ch_state;
2081 }
2082 }
2083 }
2084 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2085 when we fail in state 1, this allows us to use the
2086 charclass scan to find a valid start char. This is based on the principle
2087 that theres a good chance the string being searched contains lots of stuff
2088 that cant be a start char.
2089 */
2090 fail[ 0 ] = fail[ 1 ] = 0;
2091 DEBUG_TRIE_COMPILE_r({
1e2e3d02
YO
2092 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable (%"UVuf" states): 0",
2093 (int)(depth * 2), "", numstates
2094 );
786e8c11
YO
2095 for( q_read=1; q_read<numstates; q_read++ ) {
2096 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2097 }
2098 PerlIO_printf(Perl_debug_log, "\n");
2099 });
2100 Safefree(q);
2101 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2102}
2103
786e8c11 2104
a3621e74 2105/*
5d1c421c
JH
2106 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2107 * These need to be revisited when a newer toolchain becomes available.
2108 */
2109#if defined(__sparc64__) && defined(__GNUC__)
2110# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2111# undef SPARC64_GCC_WORKAROUND
2112# define SPARC64_GCC_WORKAROUND 1
2113# endif
2114#endif
2115
07be1b83
YO
2116#define DEBUG_PEEP(str,scan,depth) \
2117 DEBUG_OPTIMISE_r({ \
2118 SV * const mysv=sv_newmortal(); \
2119 regnode *Next = regnext(scan); \
2120 regprop(RExC_rx, mysv, scan); \
7f69552c 2121 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2122 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2123 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2124 });
2125
1de06328
YO
2126
2127
2128
2129
07be1b83
YO
2130#define JOIN_EXACT(scan,min,flags) \
2131 if (PL_regkind[OP(scan)] == EXACT) \
2132 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2133
be8e71aa 2134STATIC U32
07be1b83
YO
2135S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2136 /* Merge several consecutive EXACTish nodes into one. */
2137 regnode *n = regnext(scan);
2138 U32 stringok = 1;
2139 regnode *next = scan + NODE_SZ_STR(scan);
2140 U32 merged = 0;
2141 U32 stopnow = 0;
2142#ifdef DEBUGGING
2143 regnode *stop = scan;
72f13be8 2144 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2145#else
d47053eb
RGS
2146 PERL_UNUSED_ARG(depth);
2147#endif
2148#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2149 PERL_UNUSED_ARG(flags);
2150 PERL_UNUSED_ARG(val);
07be1b83 2151#endif
07be1b83
YO
2152 DEBUG_PEEP("join",scan,depth);
2153
2154 /* Skip NOTHING, merge EXACT*. */
2155 while (n &&
2156 ( PL_regkind[OP(n)] == NOTHING ||
2157 (stringok && (OP(n) == OP(scan))))
2158 && NEXT_OFF(n)
2159 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2160
2161 if (OP(n) == TAIL || n > next)
2162 stringok = 0;
2163 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2164 DEBUG_PEEP("skip:",n,depth);
2165 NEXT_OFF(scan) += NEXT_OFF(n);
2166 next = n + NODE_STEP_REGNODE;
2167#ifdef DEBUGGING
2168 if (stringok)
2169 stop = n;
2170#endif
2171 n = regnext(n);
2172 }
2173 else if (stringok) {
786e8c11 2174 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2175 regnode * const nnext = regnext(n);
2176
2177 DEBUG_PEEP("merg",n,depth);
2178
2179 merged++;
2180 if (oldl + STR_LEN(n) > U8_MAX)
2181 break;
2182 NEXT_OFF(scan) += NEXT_OFF(n);
2183 STR_LEN(scan) += STR_LEN(n);
2184 next = n + NODE_SZ_STR(n);
2185 /* Now we can overwrite *n : */
2186 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2187#ifdef DEBUGGING
2188 stop = next - 1;
2189#endif
2190 n = nnext;
2191 if (stopnow) break;
2192 }
2193
d47053eb
RGS
2194#ifdef EXPERIMENTAL_INPLACESCAN
2195 if (flags && !NEXT_OFF(n)) {
2196 DEBUG_PEEP("atch", val, depth);
2197 if (reg_off_by_arg[OP(n)]) {
2198 ARG_SET(n, val - n);
2199 }
2200 else {
2201 NEXT_OFF(n) = val - n;
2202 }
2203 stopnow = 1;
2204 }
07be1b83
YO
2205#endif
2206 }
2207
2208 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2209 /*
2210 Two problematic code points in Unicode casefolding of EXACT nodes:
2211
2212 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2213 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2214
2215 which casefold to
2216
2217 Unicode UTF-8
2218
2219 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2220 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2221
2222 This means that in case-insensitive matching (or "loose matching",
2223 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2224 length of the above casefolded versions) can match a target string
2225 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2226 This would rather mess up the minimum length computation.
2227
2228 What we'll do is to look for the tail four bytes, and then peek
2229 at the preceding two bytes to see whether we need to decrease
2230 the minimum length by four (six minus two).
2231
2232 Thanks to the design of UTF-8, there cannot be false matches:
2233 A sequence of valid UTF-8 bytes cannot be a subsequence of
2234 another valid sequence of UTF-8 bytes.
2235
2236 */
2237 char * const s0 = STRING(scan), *s, *t;
2238 char * const s1 = s0 + STR_LEN(scan) - 1;
2239 char * const s2 = s1 - 4;
e294cc5d
JH
2240#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2241 const char t0[] = "\xaf\x49\xaf\x42";
2242#else
07be1b83 2243 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2244#endif
07be1b83
YO
2245 const char * const t1 = t0 + 3;
2246
2247 for (s = s0 + 2;
2248 s < s2 && (t = ninstr(s, s1, t0, t1));
2249 s = t + 4) {
e294cc5d
JH
2250#ifdef EBCDIC
2251 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2252 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2253#else
07be1b83
YO
2254 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2255 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2256#endif
07be1b83
YO
2257 *min -= 4;
2258 }
2259 }
2260
2261#ifdef DEBUGGING
2262 /* Allow dumping */
2263 n = scan + NODE_SZ_STR(scan);
2264 while (n <= stop) {
2265 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2266 OP(n) = OPTIMIZED;
2267 NEXT_OFF(n) = 0;
2268 }
2269 n++;
2270 }
2271#endif
2272 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2273 return stopnow;
2274}
2275
653099ff
GS
2276/* REx optimizer. Converts nodes into quickier variants "in place".
2277 Finds fixed substrings. */
2278
a0288114 2279/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2280 to the position after last scanned or to NULL. */
2281
40d049e4
YO
2282#define INIT_AND_WITHP \
2283 assert(!and_withp); \
2284 Newx(and_withp,1,struct regnode_charclass_class); \
2285 SAVEFREEPV(and_withp)
07be1b83 2286
76e3520e 2287STATIC I32
40d049e4 2288S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2289 I32 *minlenp, I32 *deltap,
40d049e4
YO
2290 regnode *last,
2291 scan_data_t *data,
2292 I32 stopparen,
2293 U8* recursed,
2294 struct regnode_charclass_class *and_withp,
2295 U32 flags, U32 depth)
c277df42
IZ
2296 /* scanp: Start here (read-write). */
2297 /* deltap: Write maxlen-minlen here. */
2298 /* last: Stop before this one. */
40d049e4
YO
2299 /* data: string data about the pattern */
2300 /* stopparen: treat close N as END */
2301 /* recursed: which subroutines have we recursed into */
2302 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2303{
97aff369 2304 dVAR;
c277df42
IZ
2305 I32 min = 0, pars = 0, code;
2306 regnode *scan = *scanp, *next;
2307 I32 delta = 0;
2308 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2309 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2310 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2311 scan_data_t data_fake;
a3621e74 2312 SV *re_trie_maxbuff = NULL;
786e8c11 2313 regnode *first_non_open = scan;
a3621e74 2314 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2315#ifdef DEBUGGING
40d049e4 2316 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2317#endif
40d049e4 2318
786e8c11 2319 if ( depth == 0 ) {
40d049e4 2320 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2321 first_non_open=regnext(first_non_open);
2322 }
2323
b81d288d 2324
c277df42
IZ
2325 while (scan && OP(scan) != END && scan < last) {
2326 /* Peephole optimizer: */
1de06328 2327 DEBUG_STUDYDATA(data,depth);
07be1b83 2328 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2329 JOIN_EXACT(scan,&min,0);
a3621e74 2330
653099ff
GS
2331 /* Follow the next-chain of the current node and optimize
2332 away all the NOTHINGs from it. */
c277df42 2333 if (OP(scan) != CURLYX) {
a3b680e6 2334 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2335 ? I32_MAX
2336 /* I32 may be smaller than U16 on CRAYs! */
2337 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2338 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2339 int noff;
2340 regnode *n = scan;
b81d288d 2341
c277df42
IZ
2342 /* Skip NOTHING and LONGJMP. */
2343 while ((n = regnext(n))
3dab1dad 2344 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2345 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2346 && off + noff < max)
2347 off += noff;
2348 if (reg_off_by_arg[OP(scan)])
2349 ARG(scan) = off;
b81d288d 2350 else
c277df42
IZ
2351 NEXT_OFF(scan) = off;
2352 }
a3621e74 2353
07be1b83 2354
3dab1dad 2355
653099ff
GS
2356 /* The principal pseudo-switch. Cannot be a switch, since we
2357 look into several different things. */
b81d288d 2358 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2359 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2360 next = regnext(scan);
2361 code = OP(scan);
a3621e74 2362 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2363
2364 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2365 /* NOTE - There is similar code to this block below for handling
2366 TRIE nodes on a re-study. If you change stuff here check there
2367 too. */
c277df42 2368 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2369 struct regnode_charclass_class accum;
d4c19fe8 2370 regnode * const startbranch=scan;
c277df42 2371
653099ff 2372 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2373 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2374 if (flags & SCF_DO_STCLASS)
830247a4 2375 cl_init_zero(pRExC_state, &accum);
a3621e74 2376
c277df42 2377 while (OP(scan) == code) {
830247a4 2378 I32 deltanext, minnext, f = 0, fake;
653099ff 2379 struct regnode_charclass_class this_class;
c277df42
IZ
2380
2381 num++;
2382 data_fake.flags = 0;
40d049e4 2383 if (data) {
2c2d71f5 2384 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2385 data_fake.last_closep = data->last_closep;
2386 }
2387 else
2388 data_fake.last_closep = &fake;
c277df42
IZ
2389 next = regnext(scan);
2390 scan = NEXTOPER(scan);
2391 if (code != BRANCH)
2392 scan = NEXTOPER(scan);
653099ff 2393 if (flags & SCF_DO_STCLASS) {
830247a4 2394 cl_init(pRExC_state, &this_class);
653099ff
GS
2395 data_fake.start_class = &this_class;
2396 f = SCF_DO_STCLASS_AND;
b81d288d 2397 }
e1901655
IZ
2398 if (flags & SCF_WHILEM_VISITED_POS)
2399 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2400
653099ff 2401 /* we suppose the run is continuous, last=next...*/
1de06328 2402 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2403 next, &data_fake,
2404 stopparen, recursed, NULL, f,depth+1);
b81d288d 2405 if (min1 > minnext)
c277df42
IZ
2406 min1 = minnext;
2407 if (max1 < minnext + deltanext)
2408 max1 = minnext + deltanext;
2409 if (deltanext == I32_MAX)
aca2d497 2410 is_inf = is_inf_internal = 1;
c277df42
IZ
2411 scan = next;
2412 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2413 pars++;
3dab1dad
YO
2414 if (data) {
2415 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2416 data->flags |= SF_HAS_EVAL;
2c2d71f5 2417 data->whilem_c = data_fake.whilem_c;
3dab1dad 2418 }
653099ff 2419 if (flags & SCF_DO_STCLASS)
830247a4 2420 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2421 if (code == SUSPEND)
c277df42
IZ
2422 break;
2423 }
2424 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2425 min1 = 0;
2426 if (flags & SCF_DO_SUBSTR) {
2427 data->pos_min += min1;
2428 data->pos_delta += max1 - min1;
2429 if (max1 != min1 || is_inf)
2430 data->longest = &(data->longest_float);
2431 }
2432 min += min1;
2433 delta += max1 - min1;
653099ff 2434 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2435 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2436 if (min1) {
40d049e4 2437 cl_and(data->start_class, and_withp);
653099ff
GS
2438 flags &= ~SCF_DO_STCLASS;
2439 }
2440 }
2441 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2442 if (min1) {
2443 cl_and(data->start_class, &accum);
653099ff 2444 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2445 }
2446 else {
b81d288d 2447 /* Switch to OR mode: cache the old value of
de0c8cb8 2448 * data->start_class */
40d049e4
YO
2449 INIT_AND_WITHP;
2450 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2451 struct regnode_charclass_class);
2452 flags &= ~SCF_DO_STCLASS_AND;
2453 StructCopy(&accum, data->start_class,
2454 struct regnode_charclass_class);
2455 flags |= SCF_DO_STCLASS_OR;
2456 data->start_class->flags |= ANYOF_EOS;
2457 }
653099ff 2458 }
a3621e74 2459
786e8c11 2460 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2461 /* demq.
2462
2463 Assuming this was/is a branch we are dealing with: 'scan' now
2464 points at the item that follows the branch sequence, whatever
2465 it is. We now start at the beginning of the sequence and look
2466 for subsequences of
2467
786e8c11
YO
2468 BRANCH->EXACT=>x1
2469 BRANCH->EXACT=>x2
2470 tail
a3621e74
YO
2471
2472 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2473
2474 If we can find such a subseqence we need to turn the first
2475 element into a trie and then add the subsequent branch exact
2476 strings to the trie.
2477
2478 We have two cases
2479
786e8c11 2480 1. patterns where the whole set of branch can be converted.
a3621e74 2481
786e8c11 2482 2. patterns where only a subset can be converted.
a3621e74
YO
2483
2484 In case 1 we can replace the whole set with a single regop
2485 for the trie. In case 2 we need to keep the start and end
2486 branchs so
2487
2488 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2489 becomes BRANCH TRIE; BRANCH X;
2490
786e8c11
YO
2491 There is an additional case, that being where there is a
2492 common prefix, which gets split out into an EXACT like node
2493 preceding the TRIE node.
2494
2495 If x(1..n)==tail then we can do a simple trie, if not we make
2496 a "jump" trie, such that when we match the appropriate word
2497 we "jump" to the appopriate tail node. Essentailly we turn
2498 a nested if into a case structure of sorts.
a3621e74
YO
2499
2500 */
786e8c11 2501
3dab1dad 2502 int made=0;
0111c4fd
RGS
2503 if (!re_trie_maxbuff) {
2504 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2505 if (!SvIOK(re_trie_maxbuff))
2506 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2507 }
786e8c11 2508 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2509 regnode *cur;
2510 regnode *first = (regnode *)NULL;
2511 regnode *last = (regnode *)NULL;
2512 regnode *tail = scan;
2513 U8 optype = 0;
2514 U32 count=0;
2515
2516#ifdef DEBUGGING
c445ea15 2517 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2518#endif
2519 /* var tail is used because there may be a TAIL
2520 regop in the way. Ie, the exacts will point to the
2521 thing following the TAIL, but the last branch will
2522 point at the TAIL. So we advance tail. If we
2523 have nested (?:) we may have to move through several
2524 tails.
2525 */
2526
2527 while ( OP( tail ) == TAIL ) {
2528 /* this is the TAIL generated by (?:) */
2529 tail = regnext( tail );
2530 }
2531
3dab1dad 2532
a3621e74 2533 DEBUG_OPTIMISE_r({
32fc9b6a 2534 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2535 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2536 (int)depth * 2 + 2, "",
2537 "Looking for TRIE'able sequences. Tail node is: ",
2538 SvPV_nolen_const( mysv )
a3621e74
YO
2539 );
2540 });
3dab1dad 2541
a3621e74
YO
2542 /*
2543
2544 step through the branches, cur represents each
2545 branch, noper is the first thing to be matched
2546 as part of that branch and noper_next is the
2547 regnext() of that node. if noper is an EXACT
2548 and noper_next is the same as scan (our current
2549 position in the regex) then the EXACT branch is
2550 a possible optimization target. Once we have
2551 two or more consequetive such branches we can
2552 create a trie of the EXACT's contents and stich
2553 it in place. If the sequence represents all of
2554 the branches we eliminate the whole thing and
2555 replace it with a single TRIE. If it is a
2556 subsequence then we need to stitch it in. This
2557 means the first branch has to remain, and needs
2558 to be repointed at the item on the branch chain
2559 following the last branch optimized. This could
2560 be either a BRANCH, in which case the
2561 subsequence is internal, or it could be the
2562 item following the branch sequence in which
2563 case the subsequence is at the end.
2564
2565 */
2566
2567 /* dont use tail as the end marker for this traverse */
2568 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2569 regnode * const noper = NEXTOPER( cur );
be981c67 2570#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2571 regnode * const noper_next = regnext( noper );
be981c67 2572#endif
a3621e74 2573
a3621e74 2574 DEBUG_OPTIMISE_r({
32fc9b6a 2575 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2576 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2577 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2578
32fc9b6a 2579 regprop(RExC_rx, mysv, noper);
a3621e74 2580 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2581 SvPV_nolen_const(mysv));
a3621e74
YO
2582
2583 if ( noper_next ) {
32fc9b6a 2584 regprop(RExC_rx, mysv, noper_next );
a3621e74 2585 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2586 SvPV_nolen_const(mysv));
a3621e74 2587 }
3dab1dad
YO
2588 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2589 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2590 });
3dab1dad
YO
2591 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2592 : PL_regkind[ OP( noper ) ] == EXACT )
2593 || OP(noper) == NOTHING )
786e8c11
YO
2594#ifdef NOJUMPTRIE
2595 && noper_next == tail
2596#endif
2597 && count < U16_MAX)
a3621e74
YO
2598 {
2599 count++;
3dab1dad
YO
2600 if ( !first || optype == NOTHING ) {
2601 if (!first) first = cur;
a3621e74
YO
2602 optype = OP( noper );
2603 } else {
a3621e74 2604 last = cur;
a3621e74
YO
2605 }
2606 } else {
2607 if ( last ) {
786e8c11
YO
2608 make_trie( pRExC_state,
2609 startbranch, first, cur, tail, count,
2610 optype, depth+1 );
a3621e74 2611 }
3dab1dad 2612 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2613#ifdef NOJUMPTRIE
2614 && noper_next == tail
2615#endif
2616 ){
a3621e74
YO
2617 count = 1;
2618 first = cur;
2619 optype = OP( noper );
2620 } else {
2621 count = 0;
2622 first = NULL;
2623 optype = 0;
2624 }
2625 last = NULL;
2626 }
2627 }
2628 DEBUG_OPTIMISE_r({
32fc9b6a 2629 regprop(RExC_rx, mysv, cur);
a3621e74 2630 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2631 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2632 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2633
2634 });
2635 if ( last ) {
786e8c11 2636 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2637#ifdef TRIE_STUDY_OPT
786e8c11
YO
2638 if ( ((made == MADE_EXACT_TRIE &&
2639 startbranch == first)
2640 || ( first_non_open == first )) &&
2641 depth==0 )
2642 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2643#endif
07be1b83 2644 }
a3621e74 2645 }
3dab1dad
YO
2646
2647 } /* do trie */
786e8c11 2648
a0ed51b3 2649 }
a3621e74 2650 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2651 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2652 } else /* single branch is optimized. */
c277df42
IZ
2653 scan = NEXTOPER(scan);
2654 continue;
a0ed51b3
LW
2655 }
2656 else if (OP(scan) == EXACT) {
cd439c50 2657 I32 l = STR_LEN(scan);
c445ea15 2658 UV uc;
a0ed51b3 2659 if (UTF) {
a3b680e6 2660 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2661 l = utf8_length(s, s + l);
9041c2e3 2662 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2663 } else {
2664 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2665 }
2666 min += l;
c277df42 2667 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2668 /* The code below prefers earlier match for fixed
2669 offset, later match for variable offset. */
2670 if (data->last_end == -1) { /* Update the start info. */
2671 data->last_start_min = data->pos_min;
2672 data->last_start_max = is_inf
b81d288d 2673 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2674 }
cd439c50 2675 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2676 if (UTF)
2677 SvUTF8_on(data->last_found);
0eda9292 2678 {
9a957fbc 2679 SV * const sv = data->last_found;
a28509cc 2680 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2681 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2682 if (mg && mg->mg_len >= 0)
5e43f467
JH
2683 mg->mg_len += utf8_length((U8*)STRING(scan),
2684 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2685 }
c277df42
IZ
2686 data->last_end = data->pos_min + l;
2687 data->pos_min += l; /* As in the first entry. */
2688 data->flags &= ~SF_BEFORE_EOL;
2689 }
653099ff
GS
2690 if (flags & SCF_DO_STCLASS_AND) {
2691 /* Check whether it is compatible with what we know already! */
2692 int compat = 1;
2693
1aa99e6b 2694 if (uc >= 0x100 ||
516a5887 2695 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2696 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2697 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2698 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2699 )
653099ff
GS
2700 compat = 0;
2701 ANYOF_CLASS_ZERO(data->start_class);
2702 ANYOF_BITMAP_ZERO(data->start_class);
2703 if (compat)
1aa99e6b 2704 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2705 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2706 if (uc < 0x100)
2707 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2708 }
2709 else if (flags & SCF_DO_STCLASS_OR) {
2710 /* false positive possible if the class is case-folded */
1aa99e6b 2711 if (uc < 0x100)
9b877dbb
IH
2712 ANYOF_BITMAP_SET(data->start_class, uc);
2713 else
2714 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2715 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2716 cl_and(data->start_class, and_withp);
653099ff
GS
2717 }
2718 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2719 }
3dab1dad 2720 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2721 I32 l = STR_LEN(scan);
1aa99e6b 2722 UV uc = *((U8*)STRING(scan));
653099ff
GS
2723
2724 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2725 if (flags & SCF_DO_SUBSTR) {
2726 assert(data);
1de06328 2727 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2728 }
a0ed51b3 2729 if (UTF) {
6136c704 2730 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2731 l = utf8_length(s, s + l);
9041c2e3 2732 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2733 }
2734 min += l;
ecaa9b9c 2735 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2736 data->pos_min += l;
653099ff
GS
2737 if (flags & SCF_DO_STCLASS_AND) {
2738 /* Check whether it is compatible with what we know already! */
2739 int compat = 1;
2740
1aa99e6b 2741 if (uc >= 0x100 ||
516a5887 2742 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2743 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2744 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2745 compat = 0;
2746 ANYOF_CLASS_ZERO(data->start_class);
2747 ANYOF_BITMAP_ZERO(data->start_class);
2748 if (compat) {
1aa99e6b 2749 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2750 data->start_class->flags &= ~ANYOF_EOS;
2751 data->start_class->flags |= ANYOF_FOLD;
2752 if (OP(scan) == EXACTFL)
2753 data->start_class->flags |= ANYOF_LOCALE;
2754 }
2755 }
2756 else if (flags & SCF_DO_STCLASS_OR) {
2757 if (data->start_class->flags & ANYOF_FOLD) {
2758 /* false positive possible if the class is case-folded.
2759 Assume that the locale settings are the same... */
1aa99e6b
IH
2760 if (uc < 0x100)
2761 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2762 data->start_class->flags &= ~ANYOF_EOS;
2763 }
40d049e4 2764 cl_and(data->start_class, and_withp);
653099ff
GS
2765 }
2766 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2767 }
bfed75c6 2768 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2769 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2770 I32 f = flags, pos_before = 0;
d4c19fe8 2771 regnode * const oscan = scan;
653099ff
GS
2772 struct regnode_charclass_class this_class;
2773 struct regnode_charclass_class *oclass = NULL;
727f22e3 2774 I32 next_is_eval = 0;
653099ff 2775
3dab1dad 2776 switch (PL_regkind[OP(scan)]) {
653099ff 2777 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2778 scan = NEXTOPER(scan);
2779 goto finish;
2780 case PLUS:
653099ff 2781 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2782 next = NEXTOPER(scan);
653099ff 2783 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2784 mincount = 1;
2785 maxcount = REG_INFTY;
c277df42
IZ
2786 next = regnext(scan);
2787 scan = NEXTOPER(scan);
2788 goto do_curly;
2789 }
2790 }
2791 if (flags & SCF_DO_SUBSTR)
2792 data->pos_min++;
2793 min++;
2794 /* Fall through. */
2795 case STAR:
653099ff
GS
2796 if (flags & SCF_DO_STCLASS) {
2797 mincount = 0;
b81d288d 2798 maxcount = REG_INFTY;
653099ff
GS
2799 next = regnext(scan);
2800 scan = NEXTOPER(scan);
2801 goto do_curly;
2802 }
b81d288d 2803 is_inf = is_inf_internal = 1;
c277df42
IZ
2804 scan = regnext(scan);
2805 if (flags & SCF_DO_SUBSTR) {
1de06328 2806 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2807 data->longest = &(data->longest_float);
2808 }
2809 goto optimize_curly_tail;
2810 case CURLY:
40d049e4
YO
2811 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2812 && (scan->flags == stopparen))
2813 {
2814 mincount = 1;
2815 maxcount = 1;
2816 } else {
2817 mincount = ARG1(scan);
2818 maxcount = ARG2(scan);
2819 }
c277df42 2820 next = regnext(scan);
cb434fcc
IZ
2821 if (OP(scan) == CURLYX) {
2822 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2823 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2824 }
c277df42 2825 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2826 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2827 do_curly:
2828 if (flags & SCF_DO_SUBSTR) {
1de06328 2829 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2830 pos_before = data->pos_min;
2831 }
2832 if (data) {
2833 fl = data->flags;
2834 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2835 if (is_inf)
2836 data->flags |= SF_IS_INF;
2837 }
653099ff 2838 if (flags & SCF_DO_STCLASS) {
830247a4 2839 cl_init(pRExC_state, &this_class);
653099ff
GS
2840 oclass = data->start_class;
2841 data->start_class = &this_class;
2842 f |= SCF_DO_STCLASS_AND;
2843 f &= ~SCF_DO_STCLASS_OR;
2844 }
e1901655
IZ
2845 /* These are the cases when once a subexpression
2846 fails at a particular position, it cannot succeed
2847 even after backtracking at the enclosing scope.
b81d288d 2848
e1901655
IZ
2849 XXXX what if minimal match and we are at the
2850 initial run of {n,m}? */
2851 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2852 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2853
c277df42 2854 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2855 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2856 last, data, stopparen, recursed, NULL,
a3621e74
YO
2857 (mincount == 0
2858 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2859
2860 if (flags & SCF_DO_STCLASS)
2861 data->start_class = oclass;
2862 if (mincount == 0 || minnext == 0) {
2863 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2864 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2865 }
2866 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2867 /* Switch to OR mode: cache the old value of
653099ff 2868 * data->start_class */
40d049e4
YO
2869 INIT_AND_WITHP;
2870 StructCopy(data->start_class, and_withp,
653099ff
GS
2871 struct regnode_charclass_class);
2872 flags &= ~SCF_DO_STCLASS_AND;
2873 StructCopy(&this_class, data->start_class,
2874 struct regnode_charclass_class);
2875 flags |= SCF_DO_STCLASS_OR;
2876 data->start_class->flags |= ANYOF_EOS;
2877 }
2878 } else { /* Non-zero len */
2879 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2880 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2881 cl_and(data->start_class, and_withp);
653099ff
GS
2882 }
2883 else if (flags & SCF_DO_STCLASS_AND)
2884 cl_and(data->start_class, &this_class);
2885 flags &= ~SCF_DO_STCLASS;
2886 }
c277df42
IZ
2887 if (!scan) /* It was not CURLYX, but CURLY. */
2888 scan = next;
041457d9
DM
2889 if ( /* ? quantifier ok, except for (?{ ... }) */
2890 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2891 && (minnext == 0) && (deltanext == 0)
99799961 2892 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2893 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2894 && ckWARN(WARN_REGEXP))
b45f050a 2895 {
830247a4 2896 vWARN(RExC_parse,
b45f050a
JF
2897 "Quantifier unexpected on zero-length expression");
2898 }
2899
c277df42 2900 min += minnext * mincount;
b81d288d 2901 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2902 && (minnext + deltanext) > 0)
2903 || deltanext == I32_MAX);
aca2d497 2904 is_inf |= is_inf_internal;
c277df42
IZ
2905 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2906
2907 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2908 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2909 && data->flags & SF_IN_PAR
2910 && !(data->flags & SF_HAS_EVAL)
2911 && !deltanext && minnext == 1 ) {
2912 /* Try to optimize to CURLYN. */
2913 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2914 regnode * const nxt1 = nxt;
497b47a8
JH
2915#ifdef DEBUGGING
2916 regnode *nxt2;
2917#endif
c277df42
IZ
2918
2919 /* Skip open. */
2920 nxt = regnext(nxt);
bfed75c6 2921 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2922 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2923 && STR_LEN(nxt) == 1))
c277df42 2924 goto nogo;
497b47a8 2925#ifdef DEBUGGING
c277df42 2926 nxt2 = nxt;
497b47a8 2927#endif
c277df42 2928 nxt = regnext(nxt);
b81d288d 2929 if (OP(nxt) != CLOSE)
c277df42 2930 goto nogo;
40d049e4
YO
2931 if (RExC_open_parens) {
2932 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2933 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2934 }
c277df42 2935 /* Now we know that nxt2 is the only contents: */
eb160463 2936 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2937 OP(oscan) = CURLYN;
2938 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2939
c277df42
IZ
2940#ifdef DEBUGGING
2941 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2942 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2943 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2944 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2945 OP(nxt + 1) = OPTIMIZED; /* was count. */
2946 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2947#endif
c277df42 2948 }
c277df42
IZ
2949 nogo:
2950
2951 /* Try optimization CURLYX => CURLYM. */
b81d288d 2952 if ( OP(oscan) == CURLYX && data
c277df42 2953 && !(data->flags & SF_HAS_PAR)
c277df42 2954 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2955 && !deltanext /* atom is fixed width */
2956 && minnext != 0 /* CURLYM can't handle zero width */
2957 ) {
c277df42
IZ
2958 /* XXXX How to optimize if data == 0? */
2959 /* Optimize to a simpler form. */
2960 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2961 regnode *nxt2;
2962
2963 OP(oscan) = CURLYM;
2964 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2965 && (OP(nxt2) != WHILEM))
c277df42
IZ
2966 nxt = nxt2;
2967 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2968 /* Need to optimize away parenths. */
2969 if (data->flags & SF_IN_PAR) {
2970 /* Set the parenth number. */
2971 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2972
b81d288d 2973 if (OP(nxt) != CLOSE)
b45f050a 2974 FAIL("Panic opt close");
eb160463 2975 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2976 if (RExC_open_parens) {
2977 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2978 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2979 }
c277df42
IZ
2980 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2981 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 2982
c277df42
IZ
2983#ifdef DEBUGGING
2984 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2985 OP(nxt + 1) = OPTIMIZED; /* was count. */
2986 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2987 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2988#endif
c277df42
IZ
2989#if 0
2990 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2991 regnode *nnxt = regnext(nxt1);
b81d288d 2992
c277df42
IZ
2993 if (nnxt == nxt) {
2994 if (reg_off_by_arg[OP(nxt1)])
2995 ARG_SET(nxt1, nxt2 - nxt1);
2996 else if (nxt2 - nxt1 < U16_MAX)
2997 NEXT_OFF(nxt1) = nxt2 - nxt1;
2998 else
2999 OP(nxt) = NOTHING; /* Cannot beautify */
3000 }
3001 nxt1 = nnxt;
3002 }
3003#endif
3004 /* Optimize again: */
1de06328 3005 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3006 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3007 }
3008 else
c277df42 3009 oscan->flags = 0;
c277df42 3010 }
e1901655
IZ
3011 else if ((OP(oscan) == CURLYX)
3012 && (flags & SCF_WHILEM_VISITED_POS)
3013 /* See the comment on a similar expression above.
3014 However, this time it not a subexpression
3015 we care about, but the expression itself. */
3016 && (maxcount == REG_INFTY)
3017 && data && ++data->whilem_c < 16) {
3018 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3019 /* Find WHILEM (as in regexec.c) */
3020 regnode *nxt = oscan + NEXT_OFF(oscan);
3021
3022 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3023 nxt += ARG(nxt);
eb160463
GS
3024 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3025 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3026 }
b81d288d 3027 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3028 pars++;
3029 if (flags & SCF_DO_SUBSTR) {
c445ea15 3030 SV *last_str = NULL;
c277df42
IZ
3031 int counted = mincount != 0;
3032
3033 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3034#if defined(SPARC64_GCC_WORKAROUND)
3035 I32 b = 0;
3036 STRLEN l = 0;
cfd0369c 3037 const char *s = NULL;
5d1c421c
JH
3038 I32 old = 0;
3039
3040 if (pos_before >= data->last_start_min)
3041 b = pos_before;
3042 else
3043 b = data->last_start_min;
3044
3045 l = 0;
cfd0369c 3046 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3047 old = b - data->last_start_min;
3048
3049#else
b81d288d 3050 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3051 ? pos_before : data->last_start_min;
3052 STRLEN l;
d4c19fe8 3053 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3054 I32 old = b - data->last_start_min;
5d1c421c 3055#endif
a0ed51b3
LW
3056
3057 if (UTF)
3058 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3059
a0ed51b3 3060 l -= old;
c277df42 3061 /* Get the added string: */
79cb57f6 3062 last_str = newSVpvn(s + old, l);
0e933229
IH
3063 if (UTF)
3064 SvUTF8_on(last_str);
c277df42
IZ
3065 if (deltanext == 0 && pos_before == b) {
3066 /* What was added is a constant string */
3067 if (mincount > 1) {
3068 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3069 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3070 SvPVX_const(last_str), l, mincount - 1);
b162af07 3071 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3072 /* Add additional parts. */
b81d288d 3073 SvCUR_set(data->last_found,
c277df42
IZ
3074 SvCUR(data->last_found) - l);
3075 sv_catsv(data->last_found, last_str);
0eda9292
JH
3076 {
3077 SV * sv = data->last_found;
3078 MAGIC *mg =
3079 SvUTF8(sv) && SvMAGICAL(sv) ?
3080 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3081 if (mg && mg->mg_len >= 0)
3082 mg->mg_len += CHR_SVLEN(last_str);
3083 }
c277df42
IZ
3084 data->last_end += l * (mincount - 1);
3085 }
2a8d9689
HS
3086 } else {
3087 /* start offset must point into the last copy */
3088 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3089 data->last_start_max += is_inf ? I32_MAX
3090 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3091 }
3092 }
3093 /* It is counted once already... */
3094 data->pos_min += minnext * (mincount - counted);
3095 data->pos_delta += - counted * deltanext +
3096 (minnext + deltanext) * maxcount - minnext * mincount;
3097 if (mincount != maxcount) {
653099ff
GS
3098 /* Cannot extend fixed substrings found inside
3099 the group. */
1de06328 3100 scan_commit(pRExC_state,data,minlenp);
c277df42 3101 if (mincount && last_str) {
d4c19fe8
AL
3102 SV * const sv = data->last_found;
3103 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3104 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3105
3106 if (mg)
3107 mg->mg_len = -1;
3108 sv_setsv(sv, last_str);
c277df42 3109 data->last_end = data->pos_min;
b81d288d 3110 data->last_start_min =
a0ed51b3 3111 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3112 data->last_start_max = is_inf
3113 ? I32_MAX
c277df42 3114 : data->pos_min + data->pos_delta
a0ed51b3 3115 - CHR_SVLEN(last_str);
c277df42
IZ
3116 }
3117 data->longest = &(data->longest_float);
3118 }
aca2d497 3119 SvREFCNT_dec(last_str);
c277df42 3120 }
405ff068 3121 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3122 data->flags |= SF_HAS_EVAL;
3123 optimize_curly_tail:
c277df42 3124 if (OP(oscan) != CURLYX) {
3dab1dad 3125 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3126 && NEXT_OFF(next))
3127 NEXT_OFF(oscan) += NEXT_OFF(next);
3128 }
c277df42 3129 continue;
653099ff 3130 default: /* REF and CLUMP only? */
c277df42 3131 if (flags & SCF_DO_SUBSTR) {
1de06328 3132 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3133 data->longest = &(data->longest_float);
3134 }
aca2d497 3135 is_inf = is_inf_internal = 1;
653099ff 3136 if (flags & SCF_DO_STCLASS_OR)
830247a4 3137 cl_anything(pRExC_state, data->start_class);
653099ff 3138 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3139 break;
3140 }
a0ed51b3 3141 }
bfed75c6 3142 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3143 int value = 0;
653099ff 3144
c277df42 3145 if (flags & SCF_DO_SUBSTR) {
1de06328 3146 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3147 data->pos_min++;
3148 }
3149 min++;
653099ff
GS
3150 if (flags & SCF_DO_STCLASS) {
3151 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3152
3153 /* Some of the logic below assumes that switching
3154 locale on will only add false positives. */
3dab1dad 3155 switch (PL_regkind[OP(scan)]) {
653099ff 3156 case SANY:
653099ff
GS
3157 default:
3158 do_default:
3159 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3160 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3161 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3162 break;
3163 case REG_ANY:
3164 if (OP(scan) == SANY)
3165 goto do_default;
3166 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3167 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3168 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3169 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3170 }
3171 if (flags & SCF_DO_STCLASS_AND || !value)
3172 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3173 break;
3174 case ANYOF:
3175 if (flags & SCF_DO_STCLASS_AND)
3176 cl_and(data->start_class,
3177 (struct regnode_charclass_class*)scan);
3178 else
830247a4 3179 cl_or(pRExC_state, data->start_class,
653099ff
GS
3180 (struct regnode_charclass_class*)scan);
3181 break;
3182 case ALNUM:
3183 if (flags & SCF_DO_STCLASS_AND) {
3184 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3185 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3186 for (value = 0; value < 256; value++)
3187 if (!isALNUM(value))
3188 ANYOF_BITMAP_CLEAR(data->start_class, value);
3189 }
3190 }
3191 else {
3192 if (data->start_class->flags & ANYOF_LOCALE)
3193 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3194 else {
3195 for (value = 0; value < 256; value++)
3196 if (isALNUM(value))
b81d288d 3197 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3198 }
3199 }
3200 break;
3201 case ALNUML:
3202 if (flags & SCF_DO_STCLASS_AND) {
3203 if (data->start_class->flags & ANYOF_LOCALE)
3204 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3205 }
3206 else {
3207 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3208 data->start_class->flags |= ANYOF_LOCALE;
3209 }
3210 break;
3211 case NALNUM:
3212 if (flags & SCF_DO_STCLASS_AND) {
3213 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3214 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3215 for (value = 0; value < 256; value++)
3216 if (isALNUM(value))
3217 ANYOF_BITMAP_CLEAR(data->start_class, value);
3218 }
3219 }
3220 else {
3221 if (data->start_class->flags & ANYOF_LOCALE)
3222 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3223 else {
3224 for (value = 0; value < 256; value++)
3225 if (!isALNUM(value))
b81d288d 3226 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3227 }
3228 }
3229 break;
3230 case NALNUML:
3231 if (flags & SCF_DO_STCLASS_AND) {
3232 if (data->start_class->flags & ANYOF_LOCALE)
3233 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3234 }
3235 else {
3236 data->start_class->flags |= ANYOF_LOCALE;
3237 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3238 }
3239 break;
3240 case SPACE:
3241 if (flags & SCF_DO_STCLASS_AND) {
3242 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3243 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3244 for (value = 0; value < 256; value++)
3245 if (!isSPACE(value))
3246 ANYOF_BITMAP_CLEAR(data->start_class, value);
3247 }
3248 }
3249 else {
3250 if (data->start_class->flags & ANYOF_LOCALE)
3251 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3252 else {
3253 for (value = 0; value < 256; value++)
3254 if (isSPACE(value))
b81d288d 3255 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3256 }
3257 }
3258 break;
3259 case SPACEL:
3260 if (flags & SCF_DO_STCLASS_AND) {
3261 if (data->start_class->flags & ANYOF_LOCALE)
3262 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3263 }
3264 else {
3265 data->start_class->flags |= ANYOF_LOCALE;
3266 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3267 }
3268 break;
3269 case NSPACE:
3270 if (flags & SCF_DO_STCLASS_AND) {
3271 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3272 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3273 for (value = 0; value < 256; value++)
3274 if (isSPACE(value))
3275 ANYOF_BITMAP_CLEAR(data->start_class, value);
3276 }
3277 }
3278 else {
3279 if (data->start_class->flags & ANYOF_LOCALE)
3280 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3281 else {
3282 for (value = 0; value < 256; value++)
3283 if (!isSPACE(value))
b81d288d 3284 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3285 }
3286 }
3287 break;
3288 case NSPACEL:
3289 if (flags & SCF_DO_STCLASS_AND) {
3290 if (data->start_class->flags & ANYOF_LOCALE) {
3291 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3292 for (value = 0; value < 256; value++)
3293 if (!isSPACE(value))
3294 ANYOF_BITMAP_CLEAR(data->start_class, value);
3295 }
3296 }
3297 else {
3298 data->start_class->flags |= ANYOF_LOCALE;
3299 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3300 }
3301 break;
3302 case DIGIT:
3303 if (flags & SCF_DO_STCLASS_AND) {
3304 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3305 for (value = 0; value < 256; value++)
3306 if (!isDIGIT(value))
3307 ANYOF_BITMAP_CLEAR(data->start_class, value);
3308 }
3309 else {
3310 if (data->start_class->flags & ANYOF_LOCALE)
3311 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3312 else {
3313 for (value = 0; value < 256; value++)
3314 if (isDIGIT(value))
b81d288d 3315 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3316 }
3317 }
3318 break;
3319 case NDIGIT:
3320 if (flags & SCF_DO_STCLASS_AND) {
3321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3322 for (value = 0; value < 256; value++)
3323 if (isDIGIT(value))
3324 ANYOF_BITMAP_CLEAR(data->start_class, value);
3325 }
3326 else {
3327 if (data->start_class->flags & ANYOF_LOCALE)
3328 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3329 else {
3330 for (value = 0; value < 256; value++)
3331 if (!isDIGIT(value))
b81d288d 3332 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3333 }
3334 }
3335 break;
3336 }
3337 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3338 cl_and(data->start_class, and_withp);
653099ff
GS
3339 flags &= ~SCF_DO_STCLASS;
3340 }
a0ed51b3 3341 }
3dab1dad 3342 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3343 data->flags |= (OP(scan) == MEOL
3344 ? SF_BEFORE_MEOL
3345 : SF_BEFORE_SEOL);
a0ed51b3 3346 }
3dab1dad 3347 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3348 /* Lookbehind, or need to calculate parens/evals/stclass: */
3349 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3350 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3351 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3352 || OP(scan) == UNLESSM )
3353 {
3354 /* Negative Lookahead/lookbehind
3355 In this case we can't do fixed string optimisation.
3356 */
3357
3358 I32 deltanext, minnext, fake = 0;
3359 regnode *nscan;
3360 struct regnode_charclass_class intrnl;
3361 int f = 0;
3362
3363 data_fake.flags = 0;
3364 if (data) {
3365 data_fake.whilem_c = data->whilem_c;
3366 data_fake.last_closep = data->last_closep;
a0ed51b3 3367 }
1de06328
YO
3368 else
3369 data_fake.last_closep = &fake;
3370 if ( flags & SCF_DO_STCLASS && !scan->flags
3371 && OP(scan) == IFMATCH ) { /* Lookahead */
3372 cl_init(pRExC_state, &intrnl);
3373 data_fake.start_class = &intrnl;
3374 f |= SCF_DO_STCLASS_AND;
c277df42 3375 }
1de06328
YO
3376 if (flags & SCF_WHILEM_VISITED_POS)
3377 f |= SCF_WHILEM_VISITED_POS;
3378 next = regnext(scan);
3379 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3380 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3381 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3382 if (scan->flags) {
3383 if (deltanext) {
3384 vFAIL("Variable length lookbehind not implemented");
3385 }
3386 else if (minnext > (I32)U8_MAX) {
3387 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3388 }
3389 scan->flags = (U8)minnext;
3390 }
3391 if (data) {
3392 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3393 pars++;
3394 if (data_fake.flags & SF_HAS_EVAL)
3395 data->flags |= SF_HAS_EVAL;
3396 data->whilem_c = data_fake.whilem_c;
3397 }
3398 if (f & SCF_DO_STCLASS_AND) {
3399 const int was = (data->start_class->flags & ANYOF_EOS);
3400
3401 cl_and(data->start_class, &intrnl);
3402 if (was)
3403 data->start_class->flags |= ANYOF_EOS;
3404 }
be8e71aa 3405 }
1de06328
YO
3406#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3407 else {
3408 /* Positive Lookahead/lookbehind
3409 In this case we can do fixed string optimisation,
3410 but we must be careful about it. Note in the case of
3411 lookbehind the positions will be offset by the minimum
3412 length of the pattern, something we won't know about
3413 until after the recurse.
3414 */
3415 I32 deltanext, fake = 0;
3416 regnode *nscan;
3417 struct regnode_charclass_class intrnl;
3418 int f = 0;
3419 /* We use SAVEFREEPV so that when the full compile
3420 is finished perl will clean up the allocated
3421 minlens when its all done. This was we don't
3422 have to worry about freeing them when we know
3423 they wont be used, which would be a pain.
3424 */
3425 I32 *minnextp;
3426 Newx( minnextp, 1, I32 );
3427 SAVEFREEPV(minnextp);
3428
3429 if (data) {
3430 StructCopy(data, &data_fake, scan_data_t);
3431 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3432 f |= SCF_DO_SUBSTR;
3433 if (scan->flags)
3434 scan_commit(pRExC_state, &data_fake,minlenp);
3435 data_fake.last_found=newSVsv(data->last_found);
3436 }
3437 }
3438 else
3439 data_fake.last_closep = &fake;
3440 data_fake.flags = 0;
3441 if (is_inf)
3442 data_fake.flags |= SF_IS_INF;
3443 if ( flags & SCF_DO_STCLASS && !scan->flags
3444 && OP(scan) == IFMATCH ) { /* Lookahead */
3445 cl_init(pRExC_state, &intrnl);
3446 data_fake.start_class = &intrnl;
3447 f |= SCF_DO_STCLASS_AND;
3448 }
3449 if (flags & SCF_WHILEM_VISITED_POS)
3450 f |= SCF_WHILEM_VISITED_POS;
3451 next = regnext(scan);
3452 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3453
3454 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3455 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3456 if (scan->flags) {
3457 if (deltanext) {
3458 vFAIL("Variable length lookbehind not implemented");
3459 }
3460 else if (*minnextp > (I32)U8_MAX) {
3461 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3462 }
3463 scan->flags = (U8)*minnextp;
3464 }
40d049e4 3465
1de06328 3466 *minnextp += min;
40d049e4 3467
1de06328
YO
3468 if (f & SCF_DO_STCLASS_AND) {
3469 const int was = (data->start_class->flags & ANYOF_EOS);
3470
3471 cl_and(data->start_class, &intrnl);
3472 if (was)
3473 data->start_class->flags |= ANYOF_EOS;
40d049e4 3474 }
1de06328
YO
3475 if (data) {
3476 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3477 pars++;
3478 if (data_fake.flags & SF_HAS_EVAL)
3479 data->flags |= SF_HAS_EVAL;
3480 data->whilem_c = data_fake.whilem_c;
3481 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3482 if (RExC_rx->minlen<*minnextp)
3483 RExC_rx->minlen=*minnextp;
3484 scan_commit(pRExC_state, &data_fake, minnextp);
3485 SvREFCNT_dec(data_fake.last_found);
3486
3487 if ( data_fake.minlen_fixed != minlenp )
3488 {
3489 data->offset_fixed= data_fake.offset_fixed;
3490 data->minlen_fixed= data_fake.minlen_fixed;
3491 data->lookbehind_fixed+= scan->flags;
3492 }
3493 if ( data_fake.minlen_float != minlenp )
3494 {
3495 data->minlen_float= data_fake.minlen_float;
3496 data->offset_float_min=data_fake.offset_float_min;
3497 data->offset_float_max=data_fake.offset_float_max;
3498 data->lookbehind_float+= scan->flags;
3499 }
3500 }
3501 }
3502
653099ff 3503
653099ff 3504 }
1de06328 3505#endif
a0ed51b3
LW
3506 }
3507 else if (OP(scan) == OPEN) {
40d049e4
YO
3508 if (stopparen != (I32)ARG(scan))
3509 pars++;
a0ed51b3 3510 }
cb434fcc 3511 else if (OP(scan) == CLOSE) {
40d049e4
YO
3512 if (stopparen == (I32)ARG(scan)) {
3513 break;
3514 }
eb160463 3515 if ((I32)ARG(scan) == is_par) {
cb434fcc 3516 next = regnext(scan);
c277df42 3517
cb434fcc
IZ
3518 if ( next && (OP(next) != WHILEM) && next < last)
3519 is_par = 0; /* Disable optimization */
3520 }
3521 if (data)
3522 *(data->last_closep) = ARG(scan);
a0ed51b3 3523 }
1a147d38 3524 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3525 /* set the pointer */
3526 I32 paren;
3527 regnode *start;
3528 regnode *end;
1a147d38 3529 if (OP(scan) == GOSUB) {
40d049e4
YO
3530 paren = ARG(scan);
3531 RExC_recurse[ARG2L(scan)] = scan;
3532 start = RExC_open_parens[paren-1];
3533 end = RExC_close_parens[paren-1];
3534 } else {
3535 paren = 0;
3536 start = RExC_rx->program + 1;
3537 end = RExC_opend;
3538 }
3539 assert(start);
3540 assert(end);
3541 if (!recursed) {
3542 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3543 SAVEFREEPV(recursed);
3544 }
3545 if (!PAREN_TEST(recursed,paren+1)) {
3546 I32 deltanext = 0;
3547 PAREN_SET(recursed,paren+1);
3548
3549 DEBUG_PEEP("goto",start,depth);
3550 min += study_chunk(
3551 pRExC_state,
3552 &start,
3553 minlenp,
3554 &deltanext,
3555 end+1,
3556 data,
3557 paren,
3558 recursed,
3559 and_withp,
3560 flags,depth+1);
3561 delta+=deltanext;
3562 if (deltanext == I32_MAX) {
3563 is_inf = is_inf_internal = 1;
3564 delta=deltanext;
3565 }
3566 DEBUG_PEEP("rtrn",end,depth);
3567 PAREN_UNSET(recursed,paren+1);
3568 } else {
3569 if (flags & SCF_DO_SUBSTR) {
3570 scan_commit(pRExC_state,data,minlenp);
3571 data->longest = &(data->longest_float);
3572 }
3573 is_inf = is_inf_internal = 1;
3574 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3575 cl_anything(pRExC_state, data->start_class);
3576 flags &= ~SCF_DO_STCLASS;
3577 }
3578 }
a0ed51b3 3579 else if (OP(scan) == EVAL) {
c277df42
IZ
3580 if (data)
3581 data->flags |= SF_HAS_EVAL;
3582 }
40d049e4
YO
3583 else if ( OP(scan)==OPFAIL ) {
3584 if (flags & SCF_DO_SUBSTR) {
3585 scan_commit(pRExC_state,data,minlenp);
3586 flags &= ~SCF_DO_SUBSTR;
3587 }
3588 }
3589 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3590 {
0f5d15d6 3591 if (flags & SCF_DO_SUBSTR) {
1de06328 3592 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3593 data->longest = &(data->longest_float);
3594 }
3595 is_inf = is_inf_internal = 1;
653099ff 3596 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3597 cl_anything(pRExC_state, data->start_class);
96776eda 3598 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3599 }
786e8c11 3600#ifdef TRIE_STUDY_OPT
40d049e4 3601#ifdef FULL_TRIE_STUDY
786e8c11
YO
3602 else if (PL_regkind[OP(scan)] == TRIE) {
3603 /* NOTE - There is similar code to this block above for handling
40d049e4 3604 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3605 check there too. */
7f69552c 3606 regnode *trie_node= scan;
786e8c11
YO
3607 regnode *tail= regnext(scan);
3608 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3609 I32 max1 = 0, min1 = I32_MAX;
3610 struct regnode_charclass_class accum;
3611
3612 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3613 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3614 if (flags & SCF_DO_STCLASS)
3615 cl_init_zero(pRExC_state, &accum);
3616
3617 if (!trie->jump) {
3618 min1= trie->minlen;
3619 max1= trie->maxlen;
3620 } else {
3621 const regnode *nextbranch= NULL;
3622 U32 word;
3623
3624 for ( word=1 ; word <= trie->wordcount ; word++)
3625 {
3626 I32 deltanext=0, minnext=0, f = 0, fake;
3627 struct regnode_charclass_class this_class;
3628
3629 data_fake.flags = 0;
3630 if (data) {
3631 data_fake.whilem_c = data->whilem_c;
3632 data_fake.last_closep = data->last_closep;
3633 }
3634 else
3635 data_fake.last_closep = &fake;
3636
3637 if (flags & SCF_DO_STCLASS) {
3638 cl_init(pRExC_state, &this_class);
3639 data_fake.start_class = &this_class;
3640 f = SCF_DO_STCLASS_AND;
3641 }
3642 if (flags & SCF_WHILEM_VISITED_POS)
3643 f |= SCF_WHILEM_VISITED_POS;
3644
3645 if (trie->jump[word]) {
3646 if (!nextbranch)
7f69552c
YO
3647 nextbranch = trie_node + trie->jump[0];
3648 scan= trie_node + trie->jump[word];
786e8c11
YO
3649 /* We go from the jump point to the branch that follows
3650 it. Note this means we need the vestigal unused branches
3651 even though they arent otherwise used.
3652 */
40d049e4
YO
3653 minnext = study_chunk(pRExC_state, &scan, minlenp,
3654 &deltanext, (regnode *)nextbranch, &data_fake,
3655 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3656 }
3657 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3658 nextbranch= regnext((regnode*)nextbranch);
3659
3660 if (min1 > (I32)(minnext + trie->minlen))
3661 min1 = minnext + trie->minlen;
3662 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3663 max1 = minnext + deltanext + trie->maxlen;
3664 if (deltanext == I32_MAX)
3665 is_inf = is_inf_internal = 1;
3666
3667 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3668 pars++;
3669
3670 if (data) {
3671 if (data_fake.flags & SF_HAS_EVAL)
3672 data->flags |= SF_HAS_EVAL;
3673 data->whilem_c = data_fake.whilem_c;
3674 }
3675 if (flags & SCF_DO_STCLASS)
3676 cl_or(pRExC_state, &accum, &this_class);
3677 }
3678 }
3679 if (flags & SCF_DO_SUBSTR) {
3680 data->pos_min += min1;
3681 data->pos_delta += max1 - min1;
3682 if (max1 != min1 || is_inf)
3683 data->longest = &(data->longest_float);
3684 }
3685 min += min1;
3686 delta += max1 - min1;
3687 if (flags & SCF_DO_STCLASS_OR) {
3688 cl_or(pRExC_state, data->start_class, &accum);
3689 if (min1) {
40d049e4 3690 cl_and(data->start_class, and_withp);
786e8c11
YO
3691 flags &= ~SCF_DO_STCLASS;
3692 }
3693 }
3694 else if (flags & SCF_DO_STCLASS_AND) {
3695 if (min1) {
3696 cl_and(data->start_class, &accum);
3697 flags &= ~SCF_DO_STCLASS;
3698 }
3699 else {
3700 /* Switch to OR mode: cache the old value of
3701 * data->start_class */
40d049e4
YO
3702 INIT_AND_WITHP;
3703 StructCopy(data->start_class, and_withp,
786e8c11