This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Doc nits to re.pm
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
e2509266
JH
345#define LOC ((RExC_flags & PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
ccb2c380 373#define FAIL(msg) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
386} STMT_END
8615cb43 387
b45f050a 388/*
b45f050a
JF
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
390 */
ccb2c380 391#define Simple_vFAIL(m) STMT_START { \
a28509cc 392 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
395} STMT_END
b45f050a
JF
396
397/*
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
399 */
ccb2c380
MP
400#define vFAIL(m) STMT_START { \
401 if (!SIZE_ONLY) \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
403 Simple_vFAIL(m); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Like Simple_vFAIL(), but accepts two arguments.
408 */
ccb2c380 409#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 410 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
417 */
ccb2c380
MP
418#define vFAIL2(m,a1) STMT_START { \
419 if (!SIZE_ONLY) \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
422} STMT_END
b45f050a
JF
423
424
425/*
426 * Like Simple_vFAIL(), but accepts three arguments.
427 */
ccb2c380 428#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 429 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432} STMT_END
b45f050a
JF
433
434/*
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
436 */
ccb2c380
MP
437#define vFAIL3(m,a1,a2) STMT_START { \
438 if (!SIZE_ONLY) \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Like Simple_vFAIL(), but accepts four arguments.
445 */
ccb2c380 446#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 447 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
b45f050a 451
ccb2c380 452#define vWARN(loc,m) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARNdep(loc,m) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465
466#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
477
478#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
9d1d55b5 489
8615cb43 490
cd439c50 491/* Allow for side effects in s */
ccb2c380
MP
492#define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
494} STMT_END
cd439c50 495
fac92740
MJD
496/* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
07be1b83 500 * Position is 1 indexed.
fac92740
MJD
501 */
502
ccb2c380
MP
503#define Set_Node_Offset_To_R(node,byte) STMT_START { \
504 if (! SIZE_ONLY) { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 506 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 507 if((node) < 0) { \
551405c4 508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
509 } else { \
510 RExC_offsets[2*(node)-1] = (byte); \
511 } \
512 } \
513} STMT_END
514
515#define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
518
519#define Set_Node_Length_To_R(node,len) STMT_START { \
520 if (! SIZE_ONLY) { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 522 __LINE__, (int)(node), (int)(len))); \
ccb2c380 523 if((node) < 0) { \
551405c4 524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
525 } else { \
526 RExC_offsets[2*(node)] = (len); \
527 } \
528 } \
529} STMT_END
530
531#define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534#define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
536
537/* Get offsets and lengths */
538#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
540
07be1b83
YO
541#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
544} STMT_END
545
546
547#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548#define EXPERIMENTAL_INPLACESCAN
549#endif
550
1de06328 551#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 552DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
553 PerlIO_printf(Perl_debug_log, \
554 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
562 ); \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
580 ); \
581 PerlIO_printf(Perl_debug_log,"\n"); \
582});
583
acfe0abc 584static void clear_re(pTHX_ void *r);
4327152a 585
653099ff 586/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 587 Update the longest found anchored substring and the longest found
653099ff
GS
588 floating substrings if needed. */
589
4327152a 590STATIC void
1de06328 591S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 592{
e1ec3a88
AL
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 595 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 596
c277df42 597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 598 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
b81d288d 602 data->flags
c277df42
IZ
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
604 else
605 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
a0ed51b3
LW
608 }
609 else {
c277df42 610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
611 data->offset_float_max = (l
612 ? data->last_start_max
c277df42 613 : data->pos_min + data->pos_delta);
9051bda5
HS
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
c277df42 616 if (data->flags & SF_BEFORE_EOL)
b81d288d 617 data->flags
c277df42
IZ
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
619 else
620 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
c277df42
IZ
623 }
624 }
625 SvCUR_set(data->last_found, 0);
0eda9292 626 {
a28509cc 627 SV * const sv = data->last_found;
097eb12c
AL
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
630 if (mg)
631 mg->mg_len = 0;
632 }
0eda9292 633 }
c277df42
IZ
634 data->last_end = -1;
635 data->flags &= ~SF_BEFORE_EOL;
1de06328 636 DEBUG_STUDYDATA(data,0);
c277df42
IZ
637}
638
653099ff
GS
639/* Can match anything (initialization) */
640STATIC void
097eb12c 641S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 642{
653099ff 643 ANYOF_CLASS_ZERO(cl);
f8bef550 644 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
646 if (LOC)
647 cl->flags |= ANYOF_LOCALE;
648}
649
650/* Can match anything (initialization) */
651STATIC int
5f66b61c 652S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
653{
654 int value;
655
aaa51d5e 656 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
658 return 1;
1aa99e6b
IH
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
660 return 0;
10edeb5d 661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 662 return 0;
653099ff
GS
663 return 1;
664}
665
666/* Can match anything (initialization) */
667STATIC void
097eb12c 668S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 669{
8ecf7187 670 Zero(cl, 1, struct regnode_charclass_class);
653099ff 671 cl->type = ANYOF;
830247a4 672 cl_anything(pRExC_state, cl);
653099ff
GS
673}
674
675STATIC void
097eb12c 676S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 677{
8ecf7187 678 Zero(cl, 1, struct regnode_charclass_class);
653099ff 679 cl->type = ANYOF;
830247a4 680 cl_anything(pRExC_state, cl);
653099ff
GS
681 if (LOC)
682 cl->flags |= ANYOF_LOCALE;
683}
684
685/* 'And' a given class with another one. Can create false positives */
686/* We assume that cl is not inverted */
687STATIC void
5f66b61c 688S_cl_and(struct regnode_charclass_class *cl,
a28509cc 689 const struct regnode_charclass_class *and_with)
653099ff 690{
40d049e4
YO
691
692 assert(and_with->type == ANYOF);
653099ff
GS
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
698 int i;
699
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
703 else
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
1aa99e6b 709
14ebb1a2
JH
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
715 }
14ebb1a2
JH
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 718 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 721 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
722}
723
724/* 'OR' a given class with another one. Can create false positives */
725/* We assume that cl is not inverted */
726STATIC void
097eb12c 727S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 728{
653099ff
GS
729 if (or_with->flags & ANYOF_INVERT) {
730 /* We do not use
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
738 */
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
742 int i;
743
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
747 else {
830247a4 748 cl_anything(pRExC_state, cl);
653099ff
GS
749 }
750 } else {
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 753 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
754 || (cl->flags & ANYOF_FOLD)) ) {
755 int i;
756
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
764 }
765 }
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 767 cl_anything(pRExC_state, cl);
653099ff
GS
768 }
769 }
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
772
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
781 }
653099ff
GS
782}
783
a3621e74
YO
784#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
788
3dab1dad
YO
789
790#ifdef DEBUGGING
07be1b83 791/*
3dab1dad
YO
792 dump_trie(trie)
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
795
796 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
800
3dab1dad
YO
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
07be1b83
YO
803
804*/
3dab1dad
YO
805
806/*
807 dump_trie(trie)
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
810*/
811
812STATIC void
813S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
814{
815 U32 state;
ab3bbdeb
YO
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
818 GET_RE_DEBUG_FLAGS_DECL;
819
ab3bbdeb 820
3dab1dad
YO
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
824
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 827 if ( tmp ) {
ab3bbdeb
YO
828 PerlIO_printf( Perl_debug_log, "%*s",
829 colwidth,
ddc5bc0f 830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
834 )
835 );
3dab1dad
YO
836 }
837 }
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
840
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
843 PerlIO_printf( Perl_debug_log, "\n");
844
1e2e3d02 845 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 846 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
847
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
849
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
852 } else {
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
854 }
855
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
857
858 if ( base ) {
859 U32 ofs = 0;
860
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
864 ofs++;
865
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
867
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
872 {
ab3bbdeb
YO
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
874 colwidth,
3dab1dad
YO
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
876 } else {
ab3bbdeb 877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
878 }
879 }
880
881 PerlIO_printf( Perl_debug_log, "]");
882
883 }
884 PerlIO_printf( Perl_debug_log, "\n" );
885 }
886}
887/*
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
893*/
894STATIC void
895S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
896{
897 U32 state;
ab3bbdeb
YO
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
3dab1dad
YO
905
906 for( state=1 ; state < next_alloc ; state ++ ) {
907 U16 charid;
908
ab3bbdeb 909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
913 } else {
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
916 );
917 }
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
920 if ( tmp ) {
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
922 colwidth,
ddc5bc0f 923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
927 ) ,
1e2e3d02
YO
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
930 );
931 if (!(charid % 10))
664e119d
RGS
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
1e2e3d02 934 }
ab3bbdeb
YO
935 }
936 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
937 }
938}
939
940/*
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
946*/
947STATIC void
948S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
949{
950 U32 state;
951 U16 charid;
ab3bbdeb
YO
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
954 GET_RE_DEBUG_FLAGS_DECL;
955
956 /*
957 print out the table precompression so that we can do a visual check
958 that they are identical.
959 */
960
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
962
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 965 if ( tmp ) {
ab3bbdeb
YO
966 PerlIO_printf( Perl_debug_log, "%*s",
967 colwidth,
ddc5bc0f 968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
972 )
973 );
3dab1dad
YO
974 }
975 }
976
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
978
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
981 }
982
983 PerlIO_printf( Perl_debug_log, "\n" );
984
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
986
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
990
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
993 if (v)
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
995 else
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
997 }
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1000 } else {
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1003 }
1004 }
07be1b83 1005}
3dab1dad
YO
1006
1007#endif
1008
786e8c11
YO
1009/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
3dab1dad 1019
786e8c11 1020Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1021
786e8c11
YO
1022A trie is an N'ary tree where the branches are determined by digital
1023decomposition of the key. IE, at the root node you look up the 1st character and
1024follow that branch repeat until you find the end of the branches. Nodes can be
1025marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1026
786e8c11 1027 /he|she|his|hers/
72f13be8 1028
786e8c11
YO
1029would convert into the following structure. Numbers represent states, letters
1030following numbers represent valid transitions on the letter from that state, if
1031the number is in square brackets it represents an accepting state, otherwise it
1032will be in parenthesis.
07be1b83 1033
786e8c11
YO
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1035 | |
1036 | (2)
1037 | |
1038 (1) +-i->(6)-+-s->[7]
1039 |
1040 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1041
786e8c11
YO
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1043
1044This shows that when matching against the string 'hers' we will begin at state 1
1045read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048single traverse. We store a mapping from accepting to state to which word was
1049matched, and then when we have multiple possibilities we try to complete the
1050rest of the regex in the order in which they occured in the alternation.
1051
1052The only prior NFA like behaviour that would be changed by the TRIE support is
1053the silent ignoring of duplicate alternations which are of the form:
1054
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1056
1057Thus EVAL blocks follwing a trie may be called a different number of times with
1058and without the optimisation. With the optimisations dupes will be silently
1059ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060the following demonstrates:
1061
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1063
1064which prints out 'word' three times, but
1065
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1067
1068which doesnt print it out at all. This is due to other optimisations kicking in.
1069
1070Example of what happens on a structural level:
1071
1072The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1073
1074 1: CURLYM[1] {1,32767}(18)
1075 5: BRANCH(8)
1076 6: EXACT <ac>(16)
1077 8: BRANCH(11)
1078 9: EXACT <ad>(16)
1079 11: BRANCH(14)
1080 12: EXACT <ab>(16)
1081 16: SUCCEED(0)
1082 17: NOTHING(18)
1083 18: END(0)
1084
1085This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086and should turn into:
1087
1088 1: CURLYM[1] {1,32767}(18)
1089 5: TRIE(16)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1091 <ac>
1092 <ad>
1093 <ab>
1094 16: SUCCEED(0)
1095 17: NOTHING(18)
1096 18: END(0)
1097
1098Cases where tail != last would be like /(?foo|bar)baz/:
1099
1100 1: BRANCH(4)
1101 2: EXACT <foo>(8)
1102 4: BRANCH(7)
1103 5: EXACT <bar>(8)
1104 7: TAIL(8)
1105 8: EXACT <baz>(10)
1106 10: END(0)
1107
1108which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109and would end up looking like:
1110
1111 1: TRIE(8)
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1113 <foo>
1114 <bar>
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1120
1121is the recommended Unicode-aware way of saying
1122
1123 *(d++) = uv;
1124*/
1125
1e2e3d02 1126#define TRIE_STORE_REVCHAR \
786e8c11 1127 STMT_START { \
1e2e3d02 1128 SV *tmp = newSVpvs(""); \
786e8c11 1129 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1132 } STMT_END
1133
1134#define TRIE_READ_CHAR STMT_START { \
1135 wordlen++; \
1136 if ( UTF ) { \
1137 if ( folder ) { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1140 foldlen -= len; \
1141 scan += len; \
1142 len = 0; \
1143 } else { \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1148 } \
1149 } else { \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1151 } \
1152 } else { \
1153 uvc = (U32)*uc; \
1154 len = 1; \
1155 } \
1156} STMT_END
1157
1158
1159
1160#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1164 } \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1168} STMT_END
07be1b83 1169
786e8c11
YO
1170#define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1180 \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1183 DEBUG_r({ \
1184 /* store the word for dumping */ \
1185 SV* tmp; \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1188 else \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1192 }); \
1193 \
1194 curword++; \
1195 \
1196 if ( noper_next < tail ) { \
1197 if (!trie->jump) \
1198 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1199 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1200 if (!jumper) \
1201 jumper = noper_next; \
1202 if (!nextbranch) \
1203 nextbranch= regnext(cur); \
1204 } \
1205 \
1206 if ( dupe ) { \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1216 } else { \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1219 } \
1220} STMT_END
07be1b83 1221
3dab1dad 1222
786e8c11
YO
1223#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1230 )
3dab1dad 1231
786e8c11
YO
1232#define MADE_TRIE 1
1233#define MADE_JUMP_TRIE 2
1234#define MADE_EXACT_TRIE 4
3dab1dad 1235
a3621e74 1236STATIC I32
786e8c11 1237S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1238{
27da23d5 1239 dVAR;
a3621e74
YO
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1242 regnode *cur;
9f7f3913 1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1244 STRLEN len = 0;
1245 UV uvc = 0;
1246 U16 curword = 0;
1247 U32 next_alloc = 0;
786e8c11
YO
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
7f69552c 1250 regnode *convert = NULL;
a3621e74 1251 /* we just use folder as a flag in utf8 */
e1ec3a88 1252 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1253 ? PL_fold
1254 : ( flags == EXACTFL
1255 ? PL_fold_locale
1256 : NULL
1257 )
1258 );
1259
e1ec3a88 1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1261 SV *re_trie_maxbuff;
3dab1dad
YO
1262#ifndef DEBUGGING
1263 /* these are only used during construction but are useful during
8e11feef 1264 * debugging so we store them in the struct when debugging.
8e11feef 1265 */
3dab1dad 1266 STRLEN trie_charcount=0;
3dab1dad
YO
1267 AV *trie_revcharmap;
1268#endif
a3621e74 1269 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1270#ifndef DEBUGGING
1271 PERL_UNUSED_ARG(depth);
1272#endif
a3621e74 1273
a02a5408 1274 Newxz( trie, 1, reg_trie_data );
a3621e74 1275 trie->refcount = 1;
3dab1dad 1276 trie->startstate = 1;
786e8c11 1277 trie->wordcount = word_count;
a3621e74 1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1279 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1282 DEBUG_r({
1283 trie->words = newAV();
a3621e74 1284 });
3dab1dad 1285 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1286
0111c4fd 1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1288 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1290 }
3dab1dad
YO
1291 DEBUG_OPTIMISE_r({
1292 PerlIO_printf( Perl_debug_log,
786e8c11 1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1297 (int)depth);
3dab1dad 1298 });
7f69552c
YO
1299
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1303 convert = first;
1304 } else {
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1307 }
1308
a3621e74
YO
1309 /* -- First loop and Setup --
1310
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1314 have unique chars.
1315
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1319 coded index.
1320
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1329
1330 */
1331
a3621e74 1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1333 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1334 const U8 *uc = (U8*)STRING( noper );
a28509cc 1335 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1336 STRLEN foldlen = 0;
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1338 const U8 *scan = (U8*)NULL;
07be1b83 1339 U32 wordlen = 0; /* required init */
3dab1dad 1340 STRLEN chars=0;
a3621e74 1341
3dab1dad
YO
1342 if (OP(noper) == NOTHING) {
1343 trie->minlen= 0;
1344 continue;
1345 }
1346 if (trie->bitmap) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1349 }
a3621e74 1350 for ( ; uc < e ; uc += len ) {
3dab1dad 1351 TRIE_CHARCOUNT(trie)++;
a3621e74 1352 TRIE_READ_CHAR;
3dab1dad 1353 chars++;
a3621e74
YO
1354 if ( uvc < 256 ) {
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1357 if ( folder )
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1359 TRIE_STORE_REVCHAR;
a3621e74
YO
1360 }
1361 } else {
1362 SV** svpp;
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1365
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1367
1368 if ( !svpp )
e4584336 1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1370
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1373 TRIE_STORE_REVCHAR;
a3621e74
YO
1374 }
1375 }
1376 }
3dab1dad
YO
1377 if( cur == first ) {
1378 trie->minlen=chars;
1379 trie->maxlen=chars;
1380 } else if (chars < trie->minlen) {
1381 trie->minlen=chars;
1382 } else if (chars > trie->maxlen) {
1383 trie->maxlen=chars;
1384 }
1385
a3621e74
YO
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
85c3142d 1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1393 );
786e8c11 1394 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1395
1396 /*
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1401 conservative but potentially much slower representation using an array
1402 of lists.
1403
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1410
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1414
1415 */
1416
1417
3dab1dad 1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1419 /*
1420 Second Pass -- Array Of Lists Representation
1421
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1425
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
a3621e74
YO
1429 */
1430
a3621e74
YO
1431 STRLEN transcount = 1;
1432
1e2e3d02
YO
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1436
3dab1dad 1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1438 TRIE_LIST_NEW(1);
1439 next_alloc = 2;
1440
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1442
c445ea15
AL
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
07be1b83 1450 U32 wordlen = 0; /* required init */
c445ea15
AL
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1452
3dab1dad 1453 if (OP(noper) != NOTHING) {
786e8c11 1454 for ( ; uc < e ; uc += len ) {
c445ea15 1455
786e8c11 1456 TRIE_READ_CHAR;
c445ea15 1457
786e8c11
YO
1458 if ( uvc < 256 ) {
1459 charid = trie->charmap[ uvc ];
c445ea15 1460 } else {
786e8c11
YO
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1462 if ( !svpp ) {
1463 charid = 0;
1464 } else {
1465 charid=(U16)SvIV( *svpp );
1466 }
c445ea15 1467 }
786e8c11
YO
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1469 if ( charid ) {
a3621e74 1470
786e8c11
YO
1471 U16 check;
1472 U32 newstate = 0;
a3621e74 1473
786e8c11
YO
1474 charid--;
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
c445ea15 1477 }
786e8c11
YO
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1481 break;
1482 }
1483 }
1484 if ( ! newstate ) {
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1487 transcount++;
1488 }
1489 state = newstate;
1490 } else {
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1492 }
a28509cc 1493 }
c445ea15 1494 }
3dab1dad 1495 TRIE_HANDLE_WORD(state);
a3621e74
YO
1496
1497 } /* end second pass */
1498
1e2e3d02
YO
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
a3621e74
YO
1501 Renew( trie->states, next_alloc, reg_trie_state );
1502
3dab1dad
YO
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1506 );
a3621e74 1507
a02a5408 1508 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1509 {
1510 U32 state;
a3621e74
YO
1511 U32 tp = 0;
1512 U32 zp = 0;
1513
1514
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1516 U32 base=0;
1517
1518 /*
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1521 );
1522 */
1523
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1526 U16 maxid=minid;
a28509cc 1527 U16 idx;
a3621e74
YO
1528
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1532 minid=forid;
1533 } else if ( forid > maxid ) {
1534 maxid=forid;
1535 }
a3621e74
YO
1536 }
1537 if ( transcount < tp + maxid - minid + 1) {
1538 transcount *= 2;
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1541 }
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1544 U32 set = 0;
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1550 set = 1;
1551 break;
1552 }
1553 }
1554 if ( !set ) {
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1557 tp++;
1558 zp = tp;
1559 }
1560 } else {
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1565 }
1566 tp += ( maxid - minid + 1 );
1567 }
1568 Safefree(trie->states[ state ].trans.list);
1569 }
1570 /*
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1573 );
1574 */
1575 trie->states[ state ].trans.base=base;
1576 }
cc601c31 1577 trie->lasttrans = tp + 1;
a3621e74
YO
1578 }
1579 } else {
1580 /*
1581 Second Pass -- Flat Table Representation.
1582
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1587
1588 We then construct the trie using only the .next slots of the entry
1589 structs.
1590
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1594
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1596 transition.
1597
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1603
1604 A B A B
1605 1. 2 4 1. 3 7
1606 2. 0 3 3. 0 5
1607 3. 0 0 5. 0 0
1608 4. 0 0 7. 0 0
1609
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1613
1614 */
1e2e3d02
YO
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
3dab1dad
YO
1618
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1620 reg_trie_trans );
3dab1dad 1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1622 next_alloc = trie->uniquecharcount + 1;
1623
3dab1dad 1624
a3621e74
YO
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1626
c445ea15 1627 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1630
1631 U32 state = 1; /* required init */
1632
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1636
1637 STRLEN foldlen = 0; /* required init */
07be1b83 1638 U32 wordlen = 0; /* required init */
a3621e74
YO
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1640
3dab1dad 1641 if ( OP(noper) != NOTHING ) {
786e8c11 1642 for ( ; uc < e ; uc += len ) {
a3621e74 1643
786e8c11 1644 TRIE_READ_CHAR;
a3621e74 1645
786e8c11
YO
1646 if ( uvc < 256 ) {
1647 charid = trie->charmap[ uvc ];
1648 } else {
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1651 }
786e8c11
YO
1652 if ( charid ) {
1653 charid--;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1658 }
1659 state = trie->trans[ state + charid ].next;
1660 } else {
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1662 }
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1664 }
a3621e74 1665 }
3dab1dad
YO
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1668
1669 } /* end second pass */
1670
3dab1dad
YO
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1674 );
a3621e74 1675
a3621e74
YO
1676 {
1677 /*
1678 * Inplace compress the table.*
1679
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1683
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1686
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1689
1690 - If .base is 0 there are no valid transitions from that node.
1691
1692 - If .base is nonzero then charid is added to it to find an entry in
1693 the trans array.
1694
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1700 valid.
1701
786e8c11 1702 XXX - wrong maybe?
a3621e74
YO
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1708 0.
1709
1710 - We set 'pos' to indicate the first entry of the second node.
1711
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1718 compression.
1719
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1722
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
786e8c11 1728 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1729
1730 - Once compressed we can Renew/realloc the structures to release the
1731 excess space.
1732
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1735
1736 demq
1737 */
a3b680e6 1738 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1739 U32 state, charid;
a3621e74 1740 U32 pos = 0, zp=0;
1e2e3d02 1741 trie->statecount = laststate;
a3621e74
YO
1742
1743 for ( state = 1 ; state < laststate ; state++ ) {
1744 U8 flag = 0;
a28509cc
AL
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1748 trie->trans[ stateidx ].check = 0;
1749
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1753 if (o_used == 1) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1756 break;
1757 }
1758 }
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1763 break;
1764 }
1765 used--;
1766 }
1767 if ( !flag ) {
1768 flag = 1;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1770 }
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1773 pos++;
1774 }
1775 }
1776 }
cc601c31 1777 trie->lasttrans = pos + 1;
1e2e3d02 1778 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1779 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1780 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1784 (IV)next_alloc,
1785 (IV)pos,
a3621e74
YO
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1787 );
1788
1789 } /* end table compress */
1790 }
1e2e3d02
YO
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1796 );
cc601c31
YO
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1799
3dab1dad
YO
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1803 );
07be1b83 1804
3dab1dad 1805 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1806 U8 nodetype =(U8)(flags & 0xFF);
1807 char *str=NULL;
786e8c11 1808
07be1b83 1809#ifdef DEBUGGING
e62cc96a 1810 regnode *optimize = NULL;
b57a0404
JH
1811 U32 mjd_offset = 0;
1812 U32 mjd_nodelen = 0;
07be1b83 1813#endif
a3621e74 1814 /*
3dab1dad
YO
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
a3621e74 1821 */
3dab1dad 1822 /* Find the node we are going to overwrite */
7f69552c 1823 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1824 /* branch sub-chain */
3dab1dad 1825 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1826 DEBUG_r({
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1829 });
7f69552c
YO
1830 /* whole branch chain */
1831 } else {
1832 DEBUG_r({
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1836 });
07be1b83 1837 }
7f69552c 1838
07be1b83
YO
1839 DEBUG_OPTIMISE_r(
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
786e8c11 1842 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1843 );
a3621e74 1844
3dab1dad
YO
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
786e8c11 1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1849 U32 state;
1e2e3d02 1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1851 U32 ofs = 0;
8e11feef
RGS
1852 I32 idx = -1;
1853 U32 count = 0;
1854 const U32 base = trie->states[ state ].trans.base;
a3621e74 1855
3dab1dad 1856 if ( trie->states[state].wordnum )
8e11feef 1857 count = 1;
a3621e74 1858
8e11feef 1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1863 {
3dab1dad 1864 if ( ++count > 1 ) {
8e11feef 1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1867 if ( state == 1 ) break;
3dab1dad
YO
1868 if ( count == 2 ) {
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1870 DEBUG_OPTIMISE_r(
8e11feef
RGS
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
786e8c11 1874 (UV)state));
be8e71aa
YO
1875 if (idx >= 0) {
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1878
3dab1dad 1879 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1880 if ( folder )
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1882 DEBUG_OPTIMISE_r(
07be1b83 1883 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1884 );
8e11feef
RGS
1885 }
1886 }
1887 TRIE_BITMAP_SET(trie,*ch);
1888 if ( folder )
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1891 }
1892 idx = ofs;
1893 }
3dab1dad
YO
1894 }
1895 if ( count == 1 ) {
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1897 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1898 DEBUG_OPTIMISE_r(
8e11feef
RGS
1899 PerlIO_printf( Perl_debug_log,
1900 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1901 (int)depth * 2 + 2, "",
786e8c11 1902 (UV)state, (UV)idx, ch)
3dab1dad
YO
1903 );
1904 if ( state==1 ) {
1905 OP( convert ) = nodetype;
1906 str=STRING(convert);
1907 STR_LEN(convert)=0;
1908 }
1909 *str++=*ch;
1910 STR_LEN(convert)++;
a3621e74 1911
8e11feef 1912 } else {
f9049ba1 1913#ifdef DEBUGGING
8e11feef
RGS
1914 if (state>1)
1915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1916#endif
8e11feef
RGS
1917 break;
1918 }
1919 }
3dab1dad 1920 if (str) {
8e11feef 1921 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1922 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1923 trie->startstate = state;
07be1b83
YO
1924 trie->minlen -= (state - 1);
1925 trie->maxlen -= (state - 1);
1926 DEBUG_r({
1927 regnode *fix = convert;
1928 mjd_nodelen++;
1929 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1930 while( ++fix < n ) {
1931 Set_Node_Offset_Length(fix, 0, 0);
1932 }
1933 });
8e11feef
RGS
1934 if (trie->maxlen) {
1935 convert = n;
1936 } else {
3dab1dad 1937 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1938 DEBUG_r(optimize= n);
3dab1dad
YO
1939 }
1940 }
1941 }
a5ca303d
YO
1942 if (!jumper)
1943 jumper = last;
3dab1dad 1944 if ( trie->maxlen ) {
8e11feef
RGS
1945 NEXT_OFF( convert ) = (U16)(tail - convert);
1946 ARG_SET( convert, data_slot );
786e8c11
YO
1947 /* Store the offset to the first unabsorbed branch in
1948 jump[0], which is otherwise unused by the jump logic.
1949 We use this when dumping a trie and during optimisation. */
1950 if (trie->jump)
7f69552c 1951 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1952
786e8c11
YO
1953 /* XXXX */
1954 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1955 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1956 {
1957 OP( convert ) = TRIEC;
1958 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1959 Safefree(trie->bitmap);
1960 trie->bitmap= NULL;
1961 } else
1962 OP( convert ) = TRIE;
a3621e74 1963
3dab1dad
YO
1964 /* store the type in the flags */
1965 convert->flags = nodetype;
a5ca303d
YO
1966 DEBUG_r({
1967 optimize = convert
1968 + NODE_STEP_REGNODE
1969 + regarglen[ OP( convert ) ];
1970 });
1971 /* XXX We really should free up the resource in trie now,
1972 as we won't use them - (which resources?) dmq */
3dab1dad 1973 }
a3621e74 1974 /* needed for dumping*/
e62cc96a 1975 DEBUG_r(if (optimize) {
07be1b83 1976 regnode *opt = convert;
e62cc96a 1977 while ( ++opt < optimize) {
07be1b83
YO
1978 Set_Node_Offset_Length(opt,0,0);
1979 }
786e8c11
YO
1980 /*
1981 Try to clean up some of the debris left after the
1982 optimisation.
a3621e74 1983 */
786e8c11 1984 while( optimize < jumper ) {
07be1b83 1985 mjd_nodelen += Node_Length((optimize));
a3621e74 1986 OP( optimize ) = OPTIMIZED;
07be1b83 1987 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1988 optimize++;
1989 }
07be1b83 1990 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1991 });
1992 } /* end node insert */
07be1b83 1993#ifndef DEBUGGING
6e8b4190 1994 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1995#endif
786e8c11
YO
1996 return trie->jump
1997 ? MADE_JUMP_TRIE
1998 : trie->startstate>1
1999 ? MADE_EXACT_TRIE
2000 : MADE_TRIE;
2001}
2002
2003STATIC void
2004S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2005{
2006/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2007
2008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2009 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2010 ISBN 0-201-10088-6
2011
2012 We find the fail state for each state in the trie, this state is the longest proper
2013 suffix of the current states 'word' that is also a proper prefix of another word in our
2014 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2015 the DFA not to have to restart after its tried and failed a word at a given point, it
2016 simply continues as though it had been matching the other word in the first place.
2017 Consider
2018 'abcdgu'=~/abcdefg|cdgu/
2019 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2020 fail, which would bring use to the state representing 'd' in the second word where we would
2021 try 'g' and succeed, prodceding to match 'cdgu'.
2022 */
2023 /* add a fail transition */
2024 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2025 U32 *q;
2026 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2027 const U32 numstates = trie->statecount;
786e8c11
YO
2028 const U32 ubound = trie->lasttrans + ucharcount;
2029 U32 q_read = 0;
2030 U32 q_write = 0;
2031 U32 charid;
2032 U32 base = trie->states[ 1 ].trans.base;
2033 U32 *fail;
2034 reg_ac_data *aho;
2035 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2036 GET_RE_DEBUG_FLAGS_DECL;
2037#ifndef DEBUGGING
2038 PERL_UNUSED_ARG(depth);
2039#endif
2040
2041
2042 ARG_SET( stclass, data_slot );
2043 Newxz( aho, 1, reg_ac_data );
2044 RExC_rx->data->data[ data_slot ] = (void*)aho;
2045 aho->trie=trie;
2046 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2047 numstates * sizeof(reg_trie_state));
786e8c11
YO
2048 Newxz( q, numstates, U32);
2049 Newxz( aho->fail, numstates, U32 );
2050 aho->refcount = 1;
2051 fail = aho->fail;
2052 /* initialize fail[0..1] to be 1 so that we always have
2053 a valid final fail state */
2054 fail[ 0 ] = fail[ 1 ] = 1;
2055
2056 for ( charid = 0; charid < ucharcount ; charid++ ) {
2057 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2058 if ( newstate ) {
2059 q[ q_write ] = newstate;
2060 /* set to point at the root */
2061 fail[ q[ q_write++ ] ]=1;
2062 }
2063 }
2064 while ( q_read < q_write) {
2065 const U32 cur = q[ q_read++ % numstates ];
2066 base = trie->states[ cur ].trans.base;
2067
2068 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2069 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2070 if (ch_state) {
2071 U32 fail_state = cur;
2072 U32 fail_base;
2073 do {
2074 fail_state = fail[ fail_state ];
2075 fail_base = aho->states[ fail_state ].trans.base;
2076 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2077
2078 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2079 fail[ ch_state ] = fail_state;
2080 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2081 {
2082 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2083 }
2084 q[ q_write++ % numstates] = ch_state;
2085 }
2086 }
2087 }
2088 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2089 when we fail in state 1, this allows us to use the
2090 charclass scan to find a valid start char. This is based on the principle
2091 that theres a good chance the string being searched contains lots of stuff
2092 that cant be a start char.
2093 */
2094 fail[ 0 ] = fail[ 1 ] = 0;
2095 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2096 PerlIO_printf(Perl_debug_log,
2097 "%*sStclass Failtable (%"UVuf" states): 0",
2098 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2099 );
786e8c11
YO
2100 for( q_read=1; q_read<numstates; q_read++ ) {
2101 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2102 }
2103 PerlIO_printf(Perl_debug_log, "\n");
2104 });
2105 Safefree(q);
2106 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2107}
2108
786e8c11 2109
a3621e74 2110/*
5d1c421c
JH
2111 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2112 * These need to be revisited when a newer toolchain becomes available.
2113 */
2114#if defined(__sparc64__) && defined(__GNUC__)
2115# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2116# undef SPARC64_GCC_WORKAROUND
2117# define SPARC64_GCC_WORKAROUND 1
2118# endif
2119#endif
2120
07be1b83
YO
2121#define DEBUG_PEEP(str,scan,depth) \
2122 DEBUG_OPTIMISE_r({ \
2123 SV * const mysv=sv_newmortal(); \
2124 regnode *Next = regnext(scan); \
2125 regprop(RExC_rx, mysv, scan); \
7f69552c 2126 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2127 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2128 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2129 });
2130
1de06328
YO
2131
2132
2133
2134
07be1b83
YO
2135#define JOIN_EXACT(scan,min,flags) \
2136 if (PL_regkind[OP(scan)] == EXACT) \
2137 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2138
be8e71aa 2139STATIC U32
07be1b83
YO
2140S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2141 /* Merge several consecutive EXACTish nodes into one. */
2142 regnode *n = regnext(scan);
2143 U32 stringok = 1;
2144 regnode *next = scan + NODE_SZ_STR(scan);
2145 U32 merged = 0;
2146 U32 stopnow = 0;
2147#ifdef DEBUGGING
2148 regnode *stop = scan;
72f13be8 2149 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2150#else
d47053eb
RGS
2151 PERL_UNUSED_ARG(depth);
2152#endif
2153#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2154 PERL_UNUSED_ARG(flags);
2155 PERL_UNUSED_ARG(val);
07be1b83 2156#endif
07be1b83
YO
2157 DEBUG_PEEP("join",scan,depth);
2158
2159 /* Skip NOTHING, merge EXACT*. */
2160 while (n &&
2161 ( PL_regkind[OP(n)] == NOTHING ||
2162 (stringok && (OP(n) == OP(scan))))
2163 && NEXT_OFF(n)
2164 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2165
2166 if (OP(n) == TAIL || n > next)
2167 stringok = 0;
2168 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2169 DEBUG_PEEP("skip:",n,depth);
2170 NEXT_OFF(scan) += NEXT_OFF(n);
2171 next = n + NODE_STEP_REGNODE;
2172#ifdef DEBUGGING
2173 if (stringok)
2174 stop = n;
2175#endif
2176 n = regnext(n);
2177 }
2178 else if (stringok) {
786e8c11 2179 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2180 regnode * const nnext = regnext(n);
2181
2182 DEBUG_PEEP("merg",n,depth);
2183
2184 merged++;
2185 if (oldl + STR_LEN(n) > U8_MAX)
2186 break;
2187 NEXT_OFF(scan) += NEXT_OFF(n);
2188 STR_LEN(scan) += STR_LEN(n);
2189 next = n + NODE_SZ_STR(n);
2190 /* Now we can overwrite *n : */
2191 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2192#ifdef DEBUGGING
2193 stop = next - 1;
2194#endif
2195 n = nnext;
2196 if (stopnow) break;
2197 }
2198
d47053eb
RGS
2199#ifdef EXPERIMENTAL_INPLACESCAN
2200 if (flags && !NEXT_OFF(n)) {
2201 DEBUG_PEEP("atch", val, depth);
2202 if (reg_off_by_arg[OP(n)]) {
2203 ARG_SET(n, val - n);
2204 }
2205 else {
2206 NEXT_OFF(n) = val - n;
2207 }
2208 stopnow = 1;
2209 }
07be1b83
YO
2210#endif
2211 }
2212
2213 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2214 /*
2215 Two problematic code points in Unicode casefolding of EXACT nodes:
2216
2217 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2218 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2219
2220 which casefold to
2221
2222 Unicode UTF-8
2223
2224 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2225 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2226
2227 This means that in case-insensitive matching (or "loose matching",
2228 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2229 length of the above casefolded versions) can match a target string
2230 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2231 This would rather mess up the minimum length computation.
2232
2233 What we'll do is to look for the tail four bytes, and then peek
2234 at the preceding two bytes to see whether we need to decrease
2235 the minimum length by four (six minus two).
2236
2237 Thanks to the design of UTF-8, there cannot be false matches:
2238 A sequence of valid UTF-8 bytes cannot be a subsequence of
2239 another valid sequence of UTF-8 bytes.
2240
2241 */
2242 char * const s0 = STRING(scan), *s, *t;
2243 char * const s1 = s0 + STR_LEN(scan) - 1;
2244 char * const s2 = s1 - 4;
e294cc5d
JH
2245#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2246 const char t0[] = "\xaf\x49\xaf\x42";
2247#else
07be1b83 2248 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2249#endif
07be1b83
YO
2250 const char * const t1 = t0 + 3;
2251
2252 for (s = s0 + 2;
2253 s < s2 && (t = ninstr(s, s1, t0, t1));
2254 s = t + 4) {
e294cc5d
JH
2255#ifdef EBCDIC
2256 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2257 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2258#else
07be1b83
YO
2259 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2260 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2261#endif
07be1b83
YO
2262 *min -= 4;
2263 }
2264 }
2265
2266#ifdef DEBUGGING
2267 /* Allow dumping */
2268 n = scan + NODE_SZ_STR(scan);
2269 while (n <= stop) {
2270 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2271 OP(n) = OPTIMIZED;
2272 NEXT_OFF(n) = 0;
2273 }
2274 n++;
2275 }
2276#endif
2277 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2278 return stopnow;
2279}
2280
653099ff
GS
2281/* REx optimizer. Converts nodes into quickier variants "in place".
2282 Finds fixed substrings. */
2283
a0288114 2284/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2285 to the position after last scanned or to NULL. */
2286
40d049e4
YO
2287#define INIT_AND_WITHP \
2288 assert(!and_withp); \
2289 Newx(and_withp,1,struct regnode_charclass_class); \
2290 SAVEFREEPV(and_withp)
07be1b83 2291
76e3520e 2292STATIC I32
40d049e4 2293S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2294 I32 *minlenp, I32 *deltap,
40d049e4
YO
2295 regnode *last,
2296 scan_data_t *data,
2297 I32 stopparen,
2298 U8* recursed,
2299 struct regnode_charclass_class *and_withp,
2300 U32 flags, U32 depth)
c277df42
IZ
2301 /* scanp: Start here (read-write). */
2302 /* deltap: Write maxlen-minlen here. */
2303 /* last: Stop before this one. */
40d049e4
YO
2304 /* data: string data about the pattern */
2305 /* stopparen: treat close N as END */
2306 /* recursed: which subroutines have we recursed into */
2307 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2308{
97aff369 2309 dVAR;
c277df42
IZ
2310 I32 min = 0, pars = 0, code;
2311 regnode *scan = *scanp, *next;
2312 I32 delta = 0;
2313 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2314 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2315 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2316 scan_data_t data_fake;
a3621e74 2317 SV *re_trie_maxbuff = NULL;
786e8c11 2318 regnode *first_non_open = scan;
e2e6a0f1 2319 I32 stopmin = I32_MAX;
a3621e74 2320 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2321#ifdef DEBUGGING
40d049e4 2322 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2323#endif
40d049e4 2324
786e8c11 2325 if ( depth == 0 ) {
40d049e4 2326 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2327 first_non_open=regnext(first_non_open);
2328 }
2329
b81d288d 2330
c277df42
IZ
2331 while (scan && OP(scan) != END && scan < last) {
2332 /* Peephole optimizer: */
1de06328 2333 DEBUG_STUDYDATA(data,depth);
07be1b83 2334 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2335 JOIN_EXACT(scan,&min,0);
a3621e74 2336
653099ff
GS
2337 /* Follow the next-chain of the current node and optimize
2338 away all the NOTHINGs from it. */
c277df42 2339 if (OP(scan) != CURLYX) {
a3b680e6 2340 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2341 ? I32_MAX
2342 /* I32 may be smaller than U16 on CRAYs! */
2343 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2344 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2345 int noff;
2346 regnode *n = scan;
b81d288d 2347
c277df42
IZ
2348 /* Skip NOTHING and LONGJMP. */
2349 while ((n = regnext(n))
3dab1dad 2350 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2351 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2352 && off + noff < max)
2353 off += noff;
2354 if (reg_off_by_arg[OP(scan)])
2355 ARG(scan) = off;
b81d288d 2356 else
c277df42
IZ
2357 NEXT_OFF(scan) = off;
2358 }
a3621e74 2359
07be1b83 2360
3dab1dad 2361
653099ff
GS
2362 /* The principal pseudo-switch. Cannot be a switch, since we
2363 look into several different things. */
b81d288d 2364 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2365 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2366 next = regnext(scan);
2367 code = OP(scan);
a3621e74 2368 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2369
2370 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2371 /* NOTE - There is similar code to this block below for handling
2372 TRIE nodes on a re-study. If you change stuff here check there
2373 too. */
c277df42 2374 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2375 struct regnode_charclass_class accum;
d4c19fe8 2376 regnode * const startbranch=scan;
c277df42 2377
653099ff 2378 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2379 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2380 if (flags & SCF_DO_STCLASS)
830247a4 2381 cl_init_zero(pRExC_state, &accum);
a3621e74 2382
c277df42 2383 while (OP(scan) == code) {
830247a4 2384 I32 deltanext, minnext, f = 0, fake;
653099ff 2385 struct regnode_charclass_class this_class;
c277df42
IZ
2386
2387 num++;
2388 data_fake.flags = 0;
40d049e4 2389 if (data) {
2c2d71f5 2390 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2391 data_fake.last_closep = data->last_closep;
2392 }
2393 else
2394 data_fake.last_closep = &fake;
c277df42
IZ
2395 next = regnext(scan);
2396 scan = NEXTOPER(scan);
2397 if (code != BRANCH)
2398 scan = NEXTOPER(scan);
653099ff 2399 if (flags & SCF_DO_STCLASS) {
830247a4 2400 cl_init(pRExC_state, &this_class);
653099ff
GS
2401 data_fake.start_class = &this_class;
2402 f = SCF_DO_STCLASS_AND;
b81d288d 2403 }
e1901655
IZ
2404 if (flags & SCF_WHILEM_VISITED_POS)
2405 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2406
653099ff 2407 /* we suppose the run is continuous, last=next...*/
1de06328 2408 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2409 next, &data_fake,
2410 stopparen, recursed, NULL, f,depth+1);
b81d288d 2411 if (min1 > minnext)
c277df42
IZ
2412 min1 = minnext;
2413 if (max1 < minnext + deltanext)
2414 max1 = minnext + deltanext;
2415 if (deltanext == I32_MAX)
aca2d497 2416 is_inf = is_inf_internal = 1;
c277df42
IZ
2417 scan = next;
2418 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2419 pars++;
e2e6a0f1
YO
2420 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2421 if ( stopmin > minnext)
2422 stopmin = min + min1;
2423 flags &= ~SCF_DO_SUBSTR;
2424 if (data)
2425 data->flags |= SCF_SEEN_ACCEPT;
2426 }
3dab1dad
YO
2427 if (data) {
2428 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2429 data->flags |= SF_HAS_EVAL;
2c2d71f5 2430 data->whilem_c = data_fake.whilem_c;
3dab1dad 2431 }
653099ff 2432 if (flags & SCF_DO_STCLASS)
830247a4 2433 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2434 if (code == SUSPEND)
c277df42
IZ
2435 break;
2436 }
2437 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2438 min1 = 0;
2439 if (flags & SCF_DO_SUBSTR) {
2440 data->pos_min += min1;
2441 data->pos_delta += max1 - min1;
2442 if (max1 != min1 || is_inf)
2443 data->longest = &(data->longest_float);
2444 }
2445 min += min1;
2446 delta += max1 - min1;
653099ff 2447 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2448 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2449 if (min1) {
40d049e4 2450 cl_and(data->start_class, and_withp);
653099ff
GS
2451 flags &= ~SCF_DO_STCLASS;
2452 }
2453 }
2454 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2455 if (min1) {
2456 cl_and(data->start_class, &accum);
653099ff 2457 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2458 }
2459 else {
b81d288d 2460 /* Switch to OR mode: cache the old value of
de0c8cb8 2461 * data->start_class */
40d049e4
YO
2462 INIT_AND_WITHP;
2463 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2464 struct regnode_charclass_class);
2465 flags &= ~SCF_DO_STCLASS_AND;
2466 StructCopy(&accum, data->start_class,
2467 struct regnode_charclass_class);
2468 flags |= SCF_DO_STCLASS_OR;
2469 data->start_class->flags |= ANYOF_EOS;
2470 }
653099ff 2471 }
a3621e74 2472
786e8c11 2473 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2474 /* demq.
2475
2476 Assuming this was/is a branch we are dealing with: 'scan' now
2477 points at the item that follows the branch sequence, whatever
2478 it is. We now start at the beginning of the sequence and look
2479 for subsequences of
2480
786e8c11
YO
2481 BRANCH->EXACT=>x1
2482 BRANCH->EXACT=>x2
2483 tail
a3621e74
YO
2484
2485 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2486
2487 If we can find such a subseqence we need to turn the first
2488 element into a trie and then add the subsequent branch exact
2489 strings to the trie.
2490
2491 We have two cases
2492
786e8c11 2493 1. patterns where the whole set of branch can be converted.
a3621e74 2494
786e8c11 2495 2. patterns where only a subset can be converted.
a3621e74
YO
2496
2497 In case 1 we can replace the whole set with a single regop
2498 for the trie. In case 2 we need to keep the start and end
2499 branchs so
2500
2501 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2502 becomes BRANCH TRIE; BRANCH X;
2503
786e8c11
YO
2504 There is an additional case, that being where there is a
2505 common prefix, which gets split out into an EXACT like node
2506 preceding the TRIE node.
2507
2508 If x(1..n)==tail then we can do a simple trie, if not we make
2509 a "jump" trie, such that when we match the appropriate word
2510 we "jump" to the appopriate tail node. Essentailly we turn
2511 a nested if into a case structure of sorts.
a3621e74
YO
2512
2513 */
786e8c11 2514
3dab1dad 2515 int made=0;
0111c4fd
RGS
2516 if (!re_trie_maxbuff) {
2517 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2518 if (!SvIOK(re_trie_maxbuff))
2519 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2520 }
786e8c11 2521 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2522 regnode *cur;
2523 regnode *first = (regnode *)NULL;
2524 regnode *last = (regnode *)NULL;
2525 regnode *tail = scan;
2526 U8 optype = 0;
2527 U32 count=0;
2528
2529#ifdef DEBUGGING
c445ea15 2530 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2531#endif
2532 /* var tail is used because there may be a TAIL
2533 regop in the way. Ie, the exacts will point to the
2534 thing following the TAIL, but the last branch will
2535 point at the TAIL. So we advance tail. If we
2536 have nested (?:) we may have to move through several
2537 tails.
2538 */
2539
2540 while ( OP( tail ) == TAIL ) {
2541 /* this is the TAIL generated by (?:) */
2542 tail = regnext( tail );
2543 }
2544
3dab1dad 2545
a3621e74 2546 DEBUG_OPTIMISE_r({
32fc9b6a 2547 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2548 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2549 (int)depth * 2 + 2, "",
2550 "Looking for TRIE'able sequences. Tail node is: ",
2551 SvPV_nolen_const( mysv )
a3621e74
YO
2552 );
2553 });
3dab1dad 2554
a3621e74
YO
2555 /*
2556
2557 step through the branches, cur represents each
2558 branch, noper is the first thing to be matched
2559 as part of that branch and noper_next is the
2560 regnext() of that node. if noper is an EXACT
2561 and noper_next is the same as scan (our current
2562 position in the regex) then the EXACT branch is
2563 a possible optimization target. Once we have
2564 two or more consequetive such branches we can
2565 create a trie of the EXACT's contents and stich
2566 it in place. If the sequence represents all of
2567 the branches we eliminate the whole thing and
2568 replace it with a single TRIE. If it is a
2569 subsequence then we need to stitch it in. This
2570 means the first branch has to remain, and needs
2571 to be repointed at the item on the branch chain
2572 following the last branch optimized. This could
2573 be either a BRANCH, in which case the
2574 subsequence is internal, or it could be the
2575 item following the branch sequence in which
2576 case the subsequence is at the end.
2577
2578 */
2579
2580 /* dont use tail as the end marker for this traverse */
2581 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2582 regnode * const noper = NEXTOPER( cur );
be981c67 2583#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2584 regnode * const noper_next = regnext( noper );
be981c67 2585#endif
a3621e74 2586
a3621e74 2587 DEBUG_OPTIMISE_r({
32fc9b6a 2588 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2589 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2590 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2591
32fc9b6a 2592 regprop(RExC_rx, mysv, noper);
a3621e74 2593 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2594 SvPV_nolen_const(mysv));
a3621e74
YO
2595
2596 if ( noper_next ) {
32fc9b6a 2597 regprop(RExC_rx, mysv, noper_next );
a3621e74 2598 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2599 SvPV_nolen_const(mysv));
a3621e74 2600 }
3dab1dad
YO
2601 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2602 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2603 });
3dab1dad
YO
2604 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2605 : PL_regkind[ OP( noper ) ] == EXACT )
2606 || OP(noper) == NOTHING )
786e8c11
YO
2607#ifdef NOJUMPTRIE
2608 && noper_next == tail
2609#endif
2610 && count < U16_MAX)
a3621e74
YO
2611 {
2612 count++;
3dab1dad
YO
2613 if ( !first || optype == NOTHING ) {
2614 if (!first) first = cur;
a3621e74
YO
2615 optype = OP( noper );
2616 } else {
a3621e74 2617 last = cur;
a3621e74
YO
2618 }
2619 } else {
2620 if ( last ) {
786e8c11
YO
2621 make_trie( pRExC_state,
2622 startbranch, first, cur, tail, count,
2623 optype, depth+1 );
a3621e74 2624 }
3dab1dad 2625 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2626#ifdef NOJUMPTRIE
2627 && noper_next == tail
2628#endif
2629 ){
a3621e74
YO
2630 count = 1;
2631 first = cur;
2632 optype = OP( noper );
2633 } else {
2634 count = 0;
2635 first = NULL;
2636 optype = 0;
2637 }
2638 last = NULL;
2639 }
2640 }
2641 DEBUG_OPTIMISE_r({
32fc9b6a 2642 regprop(RExC_rx, mysv, cur);
a3621e74 2643 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2644 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2645 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2646
2647 });
2648 if ( last ) {
786e8c11 2649 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2650#ifdef TRIE_STUDY_OPT
786e8c11
YO
2651 if ( ((made == MADE_EXACT_TRIE &&
2652 startbranch == first)
2653 || ( first_non_open == first )) &&
5d458dd8 2654 depth==0 ) {
786e8c11 2655 flags |= SCF_TRIE_RESTUDY;
5d458dd8
YO
2656 if ( startbranch == first
2657 && scan == tail )
2658 {
2659 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2660 }
2661 }
3dab1dad 2662#endif
07be1b83 2663 }
a3621e74 2664 }
3dab1dad
YO
2665
2666 } /* do trie */
786e8c11 2667
a0ed51b3 2668 }
a3621e74 2669 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2670 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2671 } else /* single branch is optimized. */
c277df42
IZ
2672 scan = NEXTOPER(scan);
2673 continue;
a0ed51b3
LW
2674 }
2675 else if (OP(scan) == EXACT) {
cd439c50 2676 I32 l = STR_LEN(scan);
c445ea15 2677 UV uc;
a0ed51b3 2678 if (UTF) {
a3b680e6 2679 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2680 l = utf8_length(s, s + l);
9041c2e3 2681 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2682 } else {
2683 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2684 }
2685 min += l;
c277df42 2686 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2687 /* The code below prefers earlier match for fixed
2688 offset, later match for variable offset. */
2689 if (data->last_end == -1) { /* Update the start info. */
2690 data->last_start_min = data->pos_min;
2691 data->last_start_max = is_inf
b81d288d 2692 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2693 }
cd439c50 2694 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2695 if (UTF)
2696 SvUTF8_on(data->last_found);
0eda9292 2697 {
9a957fbc 2698 SV * const sv = data->last_found;
a28509cc 2699 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2700 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2701 if (mg && mg->mg_len >= 0)
5e43f467
JH
2702 mg->mg_len += utf8_length((U8*)STRING(scan),
2703 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2704 }
c277df42
IZ
2705 data->last_end = data->pos_min + l;
2706 data->pos_min += l; /* As in the first entry. */
2707 data->flags &= ~SF_BEFORE_EOL;
2708 }
653099ff
GS
2709 if (flags & SCF_DO_STCLASS_AND) {
2710 /* Check whether it is compatible with what we know already! */
2711 int compat = 1;
2712
1aa99e6b 2713 if (uc >= 0x100 ||
516a5887 2714 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2715 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2716 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2717 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2718 )
653099ff
GS
2719 compat = 0;
2720 ANYOF_CLASS_ZERO(data->start_class);
2721 ANYOF_BITMAP_ZERO(data->start_class);
2722 if (compat)
1aa99e6b 2723 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2724 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2725 if (uc < 0x100)
2726 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2727 }
2728 else if (flags & SCF_DO_STCLASS_OR) {
2729 /* false positive possible if the class is case-folded */
1aa99e6b 2730 if (uc < 0x100)
9b877dbb
IH
2731 ANYOF_BITMAP_SET(data->start_class, uc);
2732 else
2733 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2734 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2735 cl_and(data->start_class, and_withp);
653099ff
GS
2736 }
2737 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2738 }
3dab1dad 2739 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2740 I32 l = STR_LEN(scan);
1aa99e6b 2741 UV uc = *((U8*)STRING(scan));
653099ff
GS
2742
2743 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2744 if (flags & SCF_DO_SUBSTR) {
2745 assert(data);
1de06328 2746 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2747 }
a0ed51b3 2748 if (UTF) {
6136c704 2749 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2750 l = utf8_length(s, s + l);
9041c2e3 2751 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2752 }
2753 min += l;
ecaa9b9c 2754 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2755 data->pos_min += l;
653099ff
GS
2756 if (flags & SCF_DO_STCLASS_AND) {
2757 /* Check whether it is compatible with what we know already! */
2758 int compat = 1;
2759
1aa99e6b 2760 if (uc >= 0x100 ||
516a5887 2761 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2762 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2763 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2764 compat = 0;
2765 ANYOF_CLASS_ZERO(data->start_class);
2766 ANYOF_BITMAP_ZERO(data->start_class);
2767 if (compat) {
1aa99e6b 2768 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2769 data->start_class->flags &= ~ANYOF_EOS;
2770 data->start_class->flags |= ANYOF_FOLD;
2771 if (OP(scan) == EXACTFL)
2772 data->start_class->flags |= ANYOF_LOCALE;
2773 }
2774 }
2775 else if (flags & SCF_DO_STCLASS_OR) {
2776 if (data->start_class->flags & ANYOF_FOLD) {
2777 /* false positive possible if the class is case-folded.
2778 Assume that the locale settings are the same... */
1aa99e6b
IH
2779 if (uc < 0x100)
2780 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2781 data->start_class->flags &= ~ANYOF_EOS;
2782 }
40d049e4 2783 cl_and(data->start_class, and_withp);
653099ff
GS
2784 }
2785 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2786 }
bfed75c6 2787 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2788 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2789 I32 f = flags, pos_before = 0;
d4c19fe8 2790 regnode * const oscan = scan;
653099ff
GS
2791 struct regnode_charclass_class this_class;
2792 struct regnode_charclass_class *oclass = NULL;
727f22e3 2793 I32 next_is_eval = 0;
653099ff 2794
3dab1dad 2795 switch (PL_regkind[OP(scan)]) {
653099ff 2796 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2797 scan = NEXTOPER(scan);
2798 goto finish;
2799 case PLUS:
653099ff 2800 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2801 next = NEXTOPER(scan);
653099ff 2802 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2803 mincount = 1;
2804 maxcount = REG_INFTY;
c277df42
IZ
2805 next = regnext(scan);
2806 scan = NEXTOPER(scan);
2807 goto do_curly;
2808 }
2809 }
2810 if (flags & SCF_DO_SUBSTR)
2811 data->pos_min++;
2812 min++;
2813 /* Fall through. */
2814 case STAR:
653099ff
GS
2815 if (flags & SCF_DO_STCLASS) {
2816 mincount = 0;
b81d288d 2817 maxcount = REG_INFTY;
653099ff
GS
2818 next = regnext(scan);
2819 scan = NEXTOPER(scan);
2820 goto do_curly;
2821 }
b81d288d 2822 is_inf = is_inf_internal = 1;
c277df42
IZ
2823 scan = regnext(scan);
2824 if (flags & SCF_DO_SUBSTR) {
1de06328 2825 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2826 data->longest = &(data->longest_float);
2827 }
2828 goto optimize_curly_tail;
2829 case CURLY:
40d049e4
YO
2830 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2831 && (scan->flags == stopparen))
2832 {
2833 mincount = 1;
2834 maxcount = 1;
2835 } else {
2836 mincount = ARG1(scan);
2837 maxcount = ARG2(scan);
2838 }
c277df42 2839 next = regnext(scan);
cb434fcc
IZ
2840 if (OP(scan) == CURLYX) {
2841 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2842 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2843 }
c277df42 2844 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2845 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2846 do_curly:
2847 if (flags & SCF_DO_SUBSTR) {
1de06328 2848 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2849 pos_before = data->pos_min;
2850 }
2851 if (data) {
2852 fl = data->flags;
2853 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2854 if (is_inf)
2855 data->flags |= SF_IS_INF;
2856 }
653099ff 2857 if (flags & SCF_DO_STCLASS) {
830247a4 2858 cl_init(pRExC_state, &this_class);
653099ff
GS
2859 oclass = data->start_class;
2860 data->start_class = &this_class;
2861 f |= SCF_DO_STCLASS_AND;
2862 f &= ~SCF_DO_STCLASS_OR;
2863 }
e1901655
IZ
2864 /* These are the cases when once a subexpression
2865 fails at a particular position, it cannot succeed
2866 even after backtracking at the enclosing scope.
b81d288d 2867
e1901655
IZ
2868 XXXX what if minimal match and we are at the
2869 initial run of {n,m}? */
2870 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2871 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2872
c277df42 2873 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2874 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2875 last, data, stopparen, recursed, NULL,
a3621e74
YO
2876 (mincount == 0
2877 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2878
2879 if (flags & SCF_DO_STCLASS)
2880 data->start_class = oclass;
2881 if (mincount == 0 || minnext == 0) {
2882 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2883 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2884 }
2885 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2886 /* Switch to OR mode: cache the old value of
653099ff 2887 * data->start_class */
40d049e4
YO
2888 INIT_AND_WITHP;
2889 StructCopy(data->start_class, and_withp,
653099ff
GS
2890 struct regnode_charclass_class);
2891 flags &= ~SCF_DO_STCLASS_AND;
2892 StructCopy(&this_class, data->start_class,
2893 struct regnode_charclass_class);
2894 flags |= SCF_DO_STCLASS_OR;
2895 data->start_class->flags |= ANYOF_EOS;
2896 }
2897 } else { /* Non-zero len */
2898 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2899 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2900 cl_and(data->start_class, and_withp);
653099ff
GS
2901 }
2902 else if (flags & SCF_DO_STCLASS_AND)
2903 cl_and(data->start_class, &this_class);
2904 flags &= ~SCF_DO_STCLASS;
2905 }
c277df42
IZ
2906 if (!scan) /* It was not CURLYX, but CURLY. */
2907 scan = next;
041457d9
DM
2908 if ( /* ? quantifier ok, except for (?{ ... }) */
2909 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2910 && (minnext == 0) && (deltanext == 0)
99799961 2911 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2912 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2913 && ckWARN(WARN_REGEXP))
b45f050a 2914 {
830247a4 2915 vWARN(RExC_parse,
b45f050a
JF
2916 "Quantifier unexpected on zero-length expression");
2917 }
2918
c277df42 2919 min += minnext * mincount;
b81d288d 2920 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2921 && (minnext + deltanext) > 0)
2922 || deltanext == I32_MAX);
aca2d497 2923 is_inf |= is_inf_internal;
c277df42
IZ
2924 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2925
2926 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2927 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2928 && data->flags & SF_IN_PAR
2929 && !(data->flags & SF_HAS_EVAL)
2930 && !deltanext && minnext == 1 ) {
2931 /* Try to optimize to CURLYN. */
2932 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2933 regnode * const nxt1 = nxt;
497b47a8
JH
2934#ifdef DEBUGGING
2935 regnode *nxt2;
2936#endif
c277df42
IZ
2937
2938 /* Skip open. */
2939 nxt = regnext(nxt);
bfed75c6 2940 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2941 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2942 && STR_LEN(nxt) == 1))
c277df42 2943 goto nogo;
497b47a8 2944#ifdef DEBUGGING
c277df42 2945 nxt2 = nxt;
497b47a8 2946#endif
c277df42 2947 nxt = regnext(nxt);
b81d288d 2948 if (OP(nxt) != CLOSE)
c277df42 2949 goto nogo;
40d049e4
YO
2950 if (RExC_open_parens) {
2951 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2952 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2953 }
c277df42 2954 /* Now we know that nxt2 is the only contents: */
eb160463 2955 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2956 OP(oscan) = CURLYN;
2957 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2958
c277df42
IZ
2959#ifdef DEBUGGING
2960 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2961 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2962 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2963 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2964 OP(nxt + 1) = OPTIMIZED; /* was count. */
2965 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2966#endif
c277df42 2967 }
c277df42
IZ
2968 nogo:
2969
2970 /* Try optimization CURLYX => CURLYM. */
b81d288d 2971 if ( OP(oscan) == CURLYX && data
c277df42 2972 && !(data->flags & SF_HAS_PAR)
c277df42 2973 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2974 && !deltanext /* atom is fixed width */
2975 && minnext != 0 /* CURLYM can't handle zero width */
2976 ) {
c277df42
IZ
2977 /* XXXX How to optimize if data == 0? */
2978 /* Optimize to a simpler form. */
2979 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2980 regnode *nxt2;
2981
2982 OP(oscan) = CURLYM;
2983 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2984 && (OP(nxt2) != WHILEM))
c277df42
IZ
2985 nxt = nxt2;
2986 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2987 /* Need to optimize away parenths. */
2988 if (data->flags & SF_IN_PAR) {
2989 /* Set the parenth number. */
2990 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2991
b81d288d 2992 if (OP(nxt) != CLOSE)
b45f050a 2993 FAIL("Panic opt close");
eb160463 2994 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2995 if (RExC_open_parens) {
2996 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2997 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2998 }
c277df42
IZ
2999 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3000 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3001
c277df42
IZ
3002#ifdef DEBUGGING
3003 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3004 OP(nxt + 1) = OPTIMIZED; /* was count. */
3005 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3006 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3007#endif
c277df42
IZ
3008#if 0
3009 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3010 regnode *nnxt = regnext(nxt1);
b81d288d 3011
c277df42
IZ
3012 if (nnxt == nxt) {
3013 if (reg_off_by_arg[OP(nxt1)])
3014 ARG_SET(nxt1, nxt2 - nxt1);
3015 else if (nxt2 - nxt1 < U16_MAX)
3016 NEXT_OFF(nxt1) = nxt2 - nxt1;
3017 else
3018 OP(nxt) = NOTHING; /* Cannot beautify */
3019 }
3020 nxt1 = nnxt;
3021 }
3022#endif
3023 /* Optimize again: */
1de06328 3024 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3025 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3026 }
3027 else
c277df42 3028 oscan->flags = 0;
c277df42 3029 }
e1901655
IZ
3030 else if ((OP(oscan) == CURLYX)
3031 && (flags & SCF_WHILEM_VISITED_POS)
3032 /* See the comment on a similar expression above.
3033 However, this time it not a subexpression
3034 we care about, but the expression itself. */
3035 && (maxcount == REG_INFTY)
3036 && data && ++data->whilem_c < 16) {
3037 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3038 /* Find WHILEM (as in regexec.c) */
3039 regnode *nxt = oscan + NEXT_OFF(oscan);
3040
3041 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3042 nxt += ARG(nxt);
eb160463
GS
3043 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3044 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3045 }
b81d288d 3046 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3047 pars++;
3048 if (flags & SCF_DO_SUBSTR) {
c445ea15 3049 SV *last_str = NULL;
c277df42
IZ
3050 int counted = mincount != 0;
3051
3052 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3053#if defined(SPARC64_GCC_WORKAROUND)
3054 I32 b = 0;
3055 STRLEN l = 0;
cfd0369c 3056 const char *s = NULL;
5d1c421c
JH
3057 I32 old = 0;
3058
3059 if (pos_before >= data->last_start_min)
3060 b = pos_before;
3061 else
3062 b = data->last_start_min;
3063
3064 l = 0;
cfd0369c 3065 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3066 old = b - data->last_start_min;
3067
3068#else
b81d288d 3069 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3070 ? pos_before : data->last_start_min;
3071 STRLEN l;
d4c19fe8 3072 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3073 I32 old = b - data->last_start_min;
5d1c421c 3074#endif
a0ed51b3
LW
3075
3076 if (UTF)
3077 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3078
a0ed51b3 3079 l -= old;
c277df42 3080 /* Get the added string: */
79cb57f6 3081 last_str = newSVpvn(s + old, l);
0e933229
IH
3082 if (UTF)
3083 SvUTF8_on(last_str);
c277df42
IZ
3084 if (deltanext == 0 && pos_before == b) {
3085 /* What was added is a constant string */
3086 if (mincount > 1) {
3087 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3088 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3089 SvPVX_const(last_str), l, mincount - 1);
b162af07 3090 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3091 /* Add additional parts. */
b81d288d 3092 SvCUR_set(data->last_found,
c277df42
IZ
3093 SvCUR(data->last_found) - l);
3094 sv_catsv(data->last_found, last_str);
0eda9292
JH
3095 {
3096 SV * sv = data->last_found;
3097 MAGIC *mg =
3098 SvUTF8(sv) && SvMAGICAL(sv) ?
3099 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3100 if (mg && mg->mg_len >= 0)
3101 mg->mg_len += CHR_SVLEN(last_str);
3102 }
c277df42
IZ
3103 data->last_end += l * (mincount - 1);
3104 }
2a8d9689
HS
3105 } else {
3106 /* start offset must point into the last copy */
3107 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3108 data->last_start_max += is_inf ? I32_MAX
3109 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3110 }
3111 }
3112 /* It is counted once already... */
3113 data->pos_min += minnext * (mincount - counted);
3114 data->pos_delta += - counted * deltanext +
3115 (minnext + deltanext) * maxcount - minnext * mincount;
3116 if (mincount != maxcount) {
653099ff
GS
3117 /* Cannot extend fixed substrings found inside
3118 the group. */
1de06328 3119 scan_commit(pRExC_state,data,minlenp);
c277df42 3120 if (mincount && last_str) {
d4c19fe8
AL
3121 SV * const sv = data->last_found;
3122 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3123 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3124
3125 if (mg)
3126 mg->mg_len = -1;
3127 sv_setsv(sv, last_str);
c277df42 3128 data->last_end = data->pos_min;
b81d288d 3129 data->last_start_min =
a0ed51b3 3130 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3131 data->last_start_max = is_inf
3132 ? I32_MAX
c277df42 3133 : data->pos_min + data->pos_delta
a0ed51b3 3134 - CHR_SVLEN(last_str);
c277df42
IZ
3135 }
3136 data->longest = &(data->longest_float);
3137 }
aca2d497 3138 SvREFCNT_dec(last_str);
c277df42 3139 }
405ff068 3140 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3141 data->flags |= SF_HAS_EVAL;
3142 optimize_curly_tail:
c277df42 3143 if (OP(oscan) != CURLYX) {
3dab1dad 3144 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3145 && NEXT_OFF(next))
3146 NEXT_OFF(oscan) += NEXT_OFF(next);
3147 }
c277df42 3148 continue;
653099ff 3149 default: /* REF and CLUMP only? */
c277df42 3150 if (flags & SCF_DO_SUBSTR) {
1de06328 3151 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3152 data->longest = &(data->longest_float);
3153 }
aca2d497 3154 is_inf = is_inf_internal = 1;
653099ff 3155 if (flags & SCF_DO_STCLASS_OR)
830247a4 3156 cl_anything(pRExC_state, data->start_class);
653099ff 3157 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3158 break;
3159 }
a0ed51b3 3160 }
bfed75c6 3161 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3162 int value = 0;
653099ff 3163
c277df42 3164 if (flags & SCF_DO_SUBSTR) {
1de06328 3165 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3166 data->pos_min++;
3167 }
3168 min++;
653099ff
GS
3169 if (flags & SCF_DO_STCLASS) {
3170 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3171
3172 /* Some of the logic below assumes that switching
3173 locale on will only add false positives. */
3dab1dad 3174 switch (PL_regkind[OP(scan)]) {
653099ff 3175 case SANY:
653099ff
GS
3176 default:
3177 do_default:
3178 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3179 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3180 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3181 break;
3182 case REG_ANY:
3183 if (OP(scan) == SANY)
3184 goto do_default;
3185 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3186 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3187 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3188 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3189 }
3190 if (flags & SCF_DO_STCLASS_AND || !value)
3191 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3192 break;
3193 case ANYOF:
3194 if (flags & SCF_DO_STCLASS_AND)
3195 cl_and(data->start_class,
3196 (struct regnode_charclass_class*)scan);
3197 else
830247a4 3198 cl_or(pRExC_state, data->start_class,
653099ff
GS
3199 (struct regnode_charclass_class*)scan);
3200 break;
3201 case ALNUM:
3202 if (flags & SCF_DO_STCLASS_AND) {
3203 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3204 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3205 for (value = 0; value < 256; value++)
3206 if (!isALNUM(value))
3207 ANYOF_BITMAP_CLEAR(data->start_class, value);
3208 }
3209 }
3210 else {
3211 if (data->start_class->flags & ANYOF_LOCALE)
3212 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3213 else {
3214 for (value = 0; value < 256; value++)
3215 if (isALNUM(value))
b81d288d 3216 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3217 }
3218 }
3219 break;
3220 case ALNUML:
3221 if (flags & SCF_DO_STCLASS_AND) {
3222 if (data->start_class->flags & ANYOF_LOCALE)
3223 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3224 }
3225 else {
3226 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3227 data->start_class->flags |= ANYOF_LOCALE;
3228 }
3229 break;
3230 case NALNUM:
3231 if (flags & SCF_DO_STCLASS_AND) {
3232 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3233 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3234 for (value = 0; value < 256; value++)
3235 if (isALNUM(value))
3236 ANYOF_BITMAP_CLEAR(data->start_class, value);
3237 }
3238 }
3239 else {
3240 if (data->start_class->flags & ANYOF_LOCALE)
3241 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3242 else {
3243 for (value = 0; value < 256; value++)
3244 if (!isALNUM(value))
b81d288d 3245 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3246 }
3247 }
3248 break;
3249 case NALNUML:
3250 if (flags & SCF_DO_STCLASS_AND) {
3251 if (data->start_class->flags & ANYOF_LOCALE)
3252 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3253 }
3254 else {
3255 data->start_class->flags |= ANYOF_LOCALE;
3256 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3257 }
3258 break;
3259 case SPACE:
3260 if (flags & SCF_DO_STCLASS_AND) {
3261 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3262 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3263 for (value = 0; value < 256; value++)
3264 if (!isSPACE(value))
3265 ANYOF_BITMAP_CLEAR(data->start_class, value);
3266 }
3267 }
3268 else {
3269 if (data->start_class->flags & ANYOF_LOCALE)
3270 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3271 else {
3272 for (value = 0; value < 256; value++)
3273 if (isSPACE(value))
b81d288d 3274 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3275 }
3276 }
3277 break;
3278 case SPACEL:
3279 if (flags & SCF_DO_STCLASS_AND) {
3280 if (data->start_class->flags & ANYOF_LOCALE)
3281 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3282 }
3283 else {
3284 data->start_class->flags |= ANYOF_LOCALE;
3285 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3286 }
3287 break;
3288 case NSPACE:
3289 if (flags & SCF_DO_STCLASS_AND) {
3290 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3291 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3292 for (value = 0; value < 256; value++)
3293 if (isSPACE(value))
3294 ANYOF_BITMAP_CLEAR(data->start_class, value);
3295 }
3296 }
3297 else {
3298 if (data->start_class->flags & ANYOF_LOCALE)
3299 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3300 else {
3301 for (value = 0; value < 256; value++)
3302 if (!isSPACE(value))
b81d288d 3303 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3304 }
3305 }
3306 break;
3307 case NSPACEL:
3308 if (flags & SCF_DO_STCLASS_AND) {
3309 if (data->start_class->flags & ANYOF_LOCALE) {
3310 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3311 for (value = 0; value < 256; value++)
3312 if (!isSPACE(value))
3313 ANYOF_BITMAP_CLEAR(data->start_class, value);
3314 }
3315 }
3316 else {
3317 data->start_class->flags |= ANYOF_LOCALE;
3318 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3319 }
3320 break;
3321 case DIGIT:
3322 if (flags & SCF_DO_STCLASS_AND) {
3323 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3324 for (value = 0; value < 256; value++)
3325 if (!isDIGIT(value))
3326 ANYOF_BITMAP_CLEAR(data->start_class, value);
3327 }
3328 else {
3329 if (data->start_class->flags & ANYOF_LOCALE)
3330 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3331 else {
3332 for (value = 0; value < 256; value++)
3333 if (isDIGIT(value))
b81d288d 3334 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3335 }
3336 }
3337 break;
3338 case NDIGIT:
3339 if (flags & SCF_DO_STCLASS_AND) {
3340 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3341 for (value = 0; value < 256; value++)
3342 if (isDIGIT(value))
3343 ANYOF_BITMAP_CLEAR(data->start_class, value);
3344 }
3345 else {
3346 if (data->start_class->flags & ANYOF_LOCALE)
3347 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3348 else {
3349 for (value = 0; value < 256; value++)
3350 if (!isDIGIT(value))
b81d288d 3351 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3352 }
3353 }
3354 break;
3355 }
3356 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3357 cl_and(data->start_class, and_withp);
653099ff
GS
3358 flags &= ~SCF_DO_STCLASS;
3359 }
a0ed51b3 3360 }
3dab1dad 3361 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3362 data->flags |= (OP(scan) == MEOL
3363 ? SF_BEFORE_MEOL
3364 : SF_BEFORE_SEOL);
a0ed51b3 3365 }
3dab1dad 3366 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3367 /* Lookbehind, or need to calculate parens/evals/stclass: */
3368 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3369 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3370 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3371 || OP(scan) == UNLESSM )
3372 {
3373 /* Negative Lookahead/lookbehind
3374 In this case we can't do fixed string optimisation.
3375 */
3376
3377 I32 deltanext, minnext, fake = 0;
3378 regnode *nscan;
3379 struct regnode_charclass_class intrnl;
3380 int f = 0;
3381
3382 data_fake.flags = 0;
3383 if (data) {
3384 data_fake.whilem_c = data->whilem_c;
3385 data_fake.last_closep = data->last_closep;
a0ed51b3 3386 }
1de06328
YO
3387 else
3388 data_fake.last_closep = &fake;
3389 if ( flags & SCF_DO_STCLASS && !scan->flags
3390 && OP(scan) == IFMATCH ) { /* Lookahead */
3391 cl_init(pRExC_state, &intrnl);
3392 data_fake.start_class = &intrnl;
3393 f |= SCF_DO_STCLASS_AND;
c277df42 3394 }
1de06328
YO
3395 if (flags & SCF_WHILEM_VISITED_POS)
3396 f |= SCF_WHILEM_VISITED_POS;
3397 next = regnext(scan);
3398 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3399 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3400 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3401 if (scan->flags) {
3402 if (deltanext) {
3403 vFAIL("Variable length lookbehind not implemented");
3404 }
3405 else if (minnext > (I32)U8_MAX) {
3406 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3407 }
3408 scan->flags = (U8)minnext;
3409 }
3410 if (data) {
3411 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3412 pars++;
3413 if (data_fake.flags & SF_HAS_EVAL)
3414 data->flags |= SF_HAS_EVAL;
3415 data->whilem_c = data_fake.whilem_c;
3416 }
3417 if (f & SCF_DO_STCLASS_AND) {
3418 const int was = (data->start_class->flags & ANYOF_EOS);
3419
3420 cl_and(data->start_class, &intrnl);
3421 if (was)
3422 data->start_class->flags |= ANYOF_EOS;
3423 }
be8e71aa 3424 }
1de06328
YO
3425#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3426 else {
3427 /* Positive Lookahead/lookbehind
3428 In this case we can do fixed string optimisation,
3429 but we must be careful about it. Note in the case of
3430 lookbehind the positions will be offset by the minimum
3431 length of the pattern, something we won't know about
3432 until after the recurse.
3433 */
3434 I32 deltanext, fake = 0;
3435 regnode *nscan;
3436 struct regnode_charclass_class intrnl;
3437 int f = 0;
3438 /* We use SAVEFREEPV so that when the full compile
3439 is finished perl will clean up the allocated
3440 minlens when its all done. This was we don't
3441 have to worry about freeing them when we know
3442 they wont be used, which would be a pain.
3443 */
3444 I32 *minnextp;
3445 Newx( minnextp, 1, I32 );
3446 SAVEFREEPV(minnextp);
3447
3448 if (data) {
3449 StructCopy(data, &data_fake, scan_data_t);
3450 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3451 f |= SCF_DO_SUBSTR;
3452 if (scan->flags)
3453 scan_commit(pRExC_state, &data_fake,minlenp);
3454 data_fake.last_found=newSVsv(data->last_found);
3455 }
3456 }
3457 else
3458 data_fake.last_closep = &fake;
3459 data_fake.flags = 0;
3460 if (is_inf)
3461 data_fake.flags |= SF_IS_INF;
3462 if ( flags & SCF_DO_STCLASS && !scan->flags
3463 && OP(scan) == IFMATCH ) { /* Lookahead */
3464 cl_init(pRExC_state, &intrnl);
3465 data_fake.start_class = &intrnl;
3466 f |= SCF_DO_STCLASS_AND;
3467 }
3468 if (flags & SCF_WHILEM_VISITED_POS)
3469 f |= SCF_WHILEM_VISITED_POS;
3470 next = regnext(scan);
3471 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3472
3473 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3474 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3475 if (scan->flags) {
3476 if (deltanext) {
3477 vFAIL("Variable length lookbehind not implemented");
3478 }
3479 else if (*minnextp > (I32)U8_MAX) {
3480 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3481 }
3482 scan->flags = (U8)*minnextp;
3483 }
40d049e4 3484
1de06328 3485 *minnextp += min;
40d049e4 3486
1de06328
YO
3487 if (f & SCF_DO_STCLASS_AND) {
3488 const int was = (data->start_class->flags & ANYOF_EOS);
3489
3490 cl_and(data->start_class, &intrnl);
3491 if (was)
3492 data->start_class->flags |= ANYOF_EOS;
40d049e4 3493 }
1de06328
YO
3494 if (data) {
3495 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3496 pars++;
3497 if (data_fake.flags & SF_HAS_EVAL)
3498 data->flags |= SF_HAS_EVAL;
3499 data->whilem_c = data_fake.whilem_c;
3500 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3501 if (RExC_rx->minlen<*minnextp)
3502 RExC_rx->minlen=*minnextp;
3503 scan_commit(pRExC_state, &data_fake, minnextp);
3504 SvREFCNT_dec(data_fake.last_found);
3505
3506 if ( data_fake.minlen_fixed != minlenp )
3507 {
3508 data->offset_fixed= data_fake.offset_fixed;
3509 data->minlen_fixed= data_fake.minlen_fixed;
3510 data->lookbehind_fixed+= scan->flags;
3511 }
3512 if ( data_fake.minlen_float != minlenp )
3513 {
3514 data->minlen_float= data_fake.minlen_float;
3515 data->offset_float_min=data_fake.offset_float_min;
3516 data->offset_float_max=data_fake.offset_float_max;
3517 data->lookbehind_float+= scan->flags;
3518 }
3519 }
3520 }
3521
653099ff 3522
653099ff 3523 }
1de06328 3524#endif
a0ed51b3
LW
3525 }
3526 else if (OP(scan) == OPEN) {
40d049e4
YO
3527 if (stopparen != (I32)ARG(scan))
3528 pars++;
a0ed51b3 3529 }
cb434fcc 3530 else if (OP(scan) == CLOSE) {
40d049e4
YO
3531 if (stopparen == (I32)ARG(scan)) {
3532 break;
3533 }
eb160463 3534 if ((I32)ARG(scan) == is_par) {
cb434fcc 3535 next = regnext(scan);
c277df42 3536
cb434fcc
IZ
3537 if ( next && (OP(next) != WHILEM) && next < last)
3538 is_par = 0; /* Disable optimization */
3539 }
3540 if (data)
3541 *(data->last_closep) = ARG(scan);
a0ed51b3 3542 }
1a147d38 3543 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3544 /* set the pointer */
3545 I32 paren;
3546 regnode *start;
3547 regnode *end;
1a147d38 3548 if (OP(scan) == GOSUB) {
40d049e4
YO
3549 paren = ARG(scan);
3550 RExC_recurse[ARG2L(scan)] = scan;
3551 start = RExC_open_parens[paren-1];
3552 end = RExC_close_parens[paren-1];
3553 } else {
3554 paren = 0;
3555 start = RExC_rx->program + 1;
3556 end = RExC_opend;
3557 }
3558 assert(start);
3559 assert(end);
3560 if (!recursed) {
3561 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3562 SAVEFREEPV(recursed);
3563 }
3564 if (!PAREN_TEST(recursed,paren+1)) {
3565 I32 deltanext = 0;
3566 PAREN_SET(recursed,paren+1);
3567
3568 DEBUG_PEEP("goto",start,depth);
3569 min += study_chunk(
3570 pRExC_state,
3571 &start,
3572 minlenp,
3573 &deltanext,
3574 end+1,
3575 data,
3576 paren,
3577 recursed,
3578 and_withp,
3579 flags,depth+1);
3580 delta+=deltanext;
3581 if (deltanext == I32_MAX) {
3582 is_inf = is_inf_internal = 1;
3583 delta=deltanext;
3584 }
3585 DEBUG_PEEP("rtrn",end,depth);
3586 PAREN_UNSET(recursed,paren+1);
3587 } else {
3588 if (flags & SCF_DO_SUBSTR) {
3589 scan_commit(pRExC_state,data,minlenp);
3590 data->longest = &(data->longest_float);
3591 }
3592 is_inf = is_inf_internal = 1;
3593 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3594 cl_anything(pRExC_state, data->start_class);
3595 flags &= ~SCF_DO_STCLASS;
3596 }
3597 }
a0ed51b3 3598 else if (OP(scan) == EVAL) {
c277df42
IZ
3599 if (data)
3600 data->flags |= SF_HAS_EVAL;
3601 }
e2e6a0f1 3602 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
40d049e4
YO
3603 if (flags & SCF_DO_SUBSTR) {
3604 scan_commit(pRExC_state,data,minlenp);
3605 flags &= ~SCF_DO_SUBSTR;
3606 }
e2e6a0f1
YO
3607 if (data && OP(scan)==ACCEPT) {
3608 data->flags |= SCF_SEEN_ACCEPT;
3609 if (stopmin > min)
3610 stopmin = min;
3611 }
40d049e4
YO
3612 }
3613 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3614 {
0f5d15d6 3615 if (flags & SCF_DO_SUBSTR) {
1de06328 3616 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3617 data->longest = &(data->longest_float);
3618 }
3619 is_inf = is_inf_internal = 1;
653099ff 3620 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3621 cl_anything(pRExC_state, data->start_class);
96776eda 3622 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3623 }
786e8c11 3624#ifdef TRIE_STUDY_OPT
40d049e4 3625#ifdef FULL_TRIE_STUDY
786e8c11
YO
3626 else if (PL_regkind[OP(scan)] == TRIE) {
3627 /* NOTE - There is similar code to this block above for handling
40d049e4 3628 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3629 check there too. */
7f69552c 3630 regnode *trie_node= scan;
786e8c11
YO
3631 regnode *tail= regnext(scan);
3632 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3633 I32 max1 = 0, min1 = I32_MAX;
3634 struct regnode_charclass_class accum;
3635
3636 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3637 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3638 if (flags & SCF_DO_STCLASS)
3639 cl_init_zero(pRExC_state, &accum);
3640
3641 if (!trie->jump) {
3642 min1= trie->minlen;
3643 max1= trie->maxlen;
3644 } else {
3645 const regnode *nextbranch= NULL;
3646 U32 word;
3647
3648 for ( word=1 ; word <= trie->wordcount ; word++)
3649 {
3650 I32 deltanext=0, minnext=0, f = 0, fake;
3651 struct regnode_charclass_class this_class;
3652
3653 data_fake.flags = 0;
3654 if (data) {
3655 data_fake.whilem_c = data->whilem_c;
3656 data_fake.last_closep = data->last_closep;
3657 }
3658 else
3659 data_fake.last_closep = &fake;
3660
3661 if (flags & SCF_DO_STCLASS) {
3662 cl_init(pRExC_state, &this_class);
3663 data_fake.start_class = &this_class;
3664 f = SCF_DO_STCLASS_AND;
3665 }
3666 if (flags & SCF_WHILEM_VISITED_POS)
3667 f |= SCF_WHILEM_VISITED_POS;
3668
3669 if (trie->jump[word]) {
3670 if (!nextbranch)
7f69552c
YO
3671 nextbranch = trie_node + trie->jump[0];
3672 scan= trie_node + trie->jump[word];
786e8c11
YO
3673 /* We go from the jump point to the branch that follows
3674 it. Note this means we need the vestigal unused branches
3675 even though they arent otherwise used.
3676 */
40d049e4
YO
3677 minnext = study_chunk(pRExC_state, &scan, minlenp,
3678 &deltanext, (regnode *)nextbranch, &data_fake,
3679 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3680 }
3681 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3682 nextbranch= regnext((regnode*)nextbranch);
3683
3684 if (min1 > (I32)(minnext + trie->minlen))
3685 min1 = minnext + trie->minlen;
3686 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3687 max1 = minnext + deltanext + trie->maxlen;
3688 if (deltanext == I32_MAX)
3689 is_inf = is_inf_internal = 1;
3690
3691 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3692 pars++;
e2e6a0f1
YO
3693 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3694 if ( stopmin > min + min1)