This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update skip count for VMS.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
e2509266
JH
345#define LOC ((RExC_flags & PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
ccb2c380 373#define FAIL(msg) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
386} STMT_END
8615cb43 387
b45f050a 388/*
b45f050a
JF
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
390 */
ccb2c380 391#define Simple_vFAIL(m) STMT_START { \
a28509cc 392 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
395} STMT_END
b45f050a
JF
396
397/*
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
399 */
ccb2c380
MP
400#define vFAIL(m) STMT_START { \
401 if (!SIZE_ONLY) \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
403 Simple_vFAIL(m); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Like Simple_vFAIL(), but accepts two arguments.
408 */
ccb2c380 409#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 410 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
417 */
ccb2c380
MP
418#define vFAIL2(m,a1) STMT_START { \
419 if (!SIZE_ONLY) \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
422} STMT_END
b45f050a
JF
423
424
425/*
426 * Like Simple_vFAIL(), but accepts three arguments.
427 */
ccb2c380 428#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 429 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432} STMT_END
b45f050a
JF
433
434/*
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
436 */
ccb2c380
MP
437#define vFAIL3(m,a1,a2) STMT_START { \
438 if (!SIZE_ONLY) \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Like Simple_vFAIL(), but accepts four arguments.
445 */
ccb2c380 446#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 447 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
b45f050a 451
ccb2c380 452#define vWARN(loc,m) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARNdep(loc,m) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465
466#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
477
478#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
9d1d55b5 489
8615cb43 490
cd439c50 491/* Allow for side effects in s */
ccb2c380
MP
492#define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
494} STMT_END
cd439c50 495
fac92740
MJD
496/* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
07be1b83 500 * Position is 1 indexed.
fac92740
MJD
501 */
502
ccb2c380
MP
503#define Set_Node_Offset_To_R(node,byte) STMT_START { \
504 if (! SIZE_ONLY) { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 506 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 507 if((node) < 0) { \
551405c4 508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
509 } else { \
510 RExC_offsets[2*(node)-1] = (byte); \
511 } \
512 } \
513} STMT_END
514
515#define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
518
519#define Set_Node_Length_To_R(node,len) STMT_START { \
520 if (! SIZE_ONLY) { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 522 __LINE__, (int)(node), (int)(len))); \
ccb2c380 523 if((node) < 0) { \
551405c4 524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
525 } else { \
526 RExC_offsets[2*(node)] = (len); \
527 } \
528 } \
529} STMT_END
530
531#define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534#define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
536
537/* Get offsets and lengths */
538#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
540
07be1b83
YO
541#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
544} STMT_END
545
546
547#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548#define EXPERIMENTAL_INPLACESCAN
549#endif
550
1de06328 551#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 552DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 553 PerlIO_printf(Perl_debug_log, \
b515a41d 554 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
562 ); \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
580 ); \
581 PerlIO_printf(Perl_debug_log,"\n"); \
582});
583
acfe0abc 584static void clear_re(pTHX_ void *r);
4327152a 585
653099ff 586/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 587 Update the longest found anchored substring and the longest found
653099ff
GS
588 floating substrings if needed. */
589
4327152a 590STATIC void
1de06328 591S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 592{
e1ec3a88
AL
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 595 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 596
c277df42 597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 598 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
b81d288d 602 data->flags
c277df42
IZ
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
604 else
605 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
a0ed51b3
LW
608 }
609 else {
c277df42 610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
611 data->offset_float_max = (l
612 ? data->last_start_max
c277df42 613 : data->pos_min + data->pos_delta);
9051bda5
HS
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
c277df42 616 if (data->flags & SF_BEFORE_EOL)
b81d288d 617 data->flags
c277df42
IZ
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
619 else
620 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
c277df42
IZ
623 }
624 }
625 SvCUR_set(data->last_found, 0);
0eda9292 626 {
a28509cc 627 SV * const sv = data->last_found;
097eb12c
AL
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
630 if (mg)
631 mg->mg_len = 0;
632 }
0eda9292 633 }
c277df42
IZ
634 data->last_end = -1;
635 data->flags &= ~SF_BEFORE_EOL;
1de06328 636 DEBUG_STUDYDATA(data,0);
c277df42
IZ
637}
638
653099ff
GS
639/* Can match anything (initialization) */
640STATIC void
097eb12c 641S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 642{
653099ff 643 ANYOF_CLASS_ZERO(cl);
f8bef550 644 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
646 if (LOC)
647 cl->flags |= ANYOF_LOCALE;
648}
649
650/* Can match anything (initialization) */
651STATIC int
5f66b61c 652S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
653{
654 int value;
655
aaa51d5e 656 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
658 return 1;
1aa99e6b
IH
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
660 return 0;
10edeb5d 661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 662 return 0;
653099ff
GS
663 return 1;
664}
665
666/* Can match anything (initialization) */
667STATIC void
097eb12c 668S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 669{
8ecf7187 670 Zero(cl, 1, struct regnode_charclass_class);
653099ff 671 cl->type = ANYOF;
830247a4 672 cl_anything(pRExC_state, cl);
653099ff
GS
673}
674
675STATIC void
097eb12c 676S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 677{
8ecf7187 678 Zero(cl, 1, struct regnode_charclass_class);
653099ff 679 cl->type = ANYOF;
830247a4 680 cl_anything(pRExC_state, cl);
653099ff
GS
681 if (LOC)
682 cl->flags |= ANYOF_LOCALE;
683}
684
685/* 'And' a given class with another one. Can create false positives */
686/* We assume that cl is not inverted */
687STATIC void
5f66b61c 688S_cl_and(struct regnode_charclass_class *cl,
a28509cc 689 const struct regnode_charclass_class *and_with)
653099ff 690{
40d049e4
YO
691
692 assert(and_with->type == ANYOF);
653099ff
GS
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
698 int i;
699
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
703 else
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
1aa99e6b 709
14ebb1a2
JH
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
715 }
14ebb1a2
JH
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 718 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 721 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
722}
723
724/* 'OR' a given class with another one. Can create false positives */
725/* We assume that cl is not inverted */
726STATIC void
097eb12c 727S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 728{
653099ff
GS
729 if (or_with->flags & ANYOF_INVERT) {
730 /* We do not use
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
738 */
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
742 int i;
743
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
747 else {
830247a4 748 cl_anything(pRExC_state, cl);
653099ff
GS
749 }
750 } else {
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 753 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
754 || (cl->flags & ANYOF_FOLD)) ) {
755 int i;
756
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
764 }
765 }
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 767 cl_anything(pRExC_state, cl);
653099ff
GS
768 }
769 }
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
772
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
781 }
653099ff
GS
782}
783
a3621e74
YO
784#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
788
3dab1dad
YO
789
790#ifdef DEBUGGING
07be1b83 791/*
3dab1dad
YO
792 dump_trie(trie)
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
795
796 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
800
3dab1dad
YO
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
07be1b83
YO
803
804*/
3dab1dad
YO
805
806/*
807 dump_trie(trie)
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
810*/
811
812STATIC void
813S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
814{
815 U32 state;
ab3bbdeb
YO
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
818 GET_RE_DEBUG_FLAGS_DECL;
819
ab3bbdeb 820
3dab1dad
YO
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
824
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 827 if ( tmp ) {
ab3bbdeb
YO
828 PerlIO_printf( Perl_debug_log, "%*s",
829 colwidth,
ddc5bc0f 830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
834 )
835 );
3dab1dad
YO
836 }
837 }
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
840
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
843 PerlIO_printf( Perl_debug_log, "\n");
844
1e2e3d02 845 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 846 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
847
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
849
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
852 } else {
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
854 }
855
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
857
858 if ( base ) {
859 U32 ofs = 0;
860
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
864 ofs++;
865
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
867
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
872 {
ab3bbdeb
YO
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
874 colwidth,
3dab1dad
YO
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
876 } else {
ab3bbdeb 877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
878 }
879 }
880
881 PerlIO_printf( Perl_debug_log, "]");
882
883 }
884 PerlIO_printf( Perl_debug_log, "\n" );
885 }
886}
887/*
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
893*/
894STATIC void
895S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
896{
897 U32 state;
ab3bbdeb
YO
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
3dab1dad
YO
905
906 for( state=1 ; state < next_alloc ; state ++ ) {
907 U16 charid;
908
ab3bbdeb 909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
913 } else {
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
916 );
917 }
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
920 if ( tmp ) {
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
922 colwidth,
ddc5bc0f 923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
927 ) ,
1e2e3d02
YO
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
930 );
931 if (!(charid % 10))
664e119d
RGS
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
1e2e3d02 934 }
ab3bbdeb
YO
935 }
936 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
937 }
938}
939
940/*
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
946*/
947STATIC void
948S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
949{
950 U32 state;
951 U16 charid;
ab3bbdeb
YO
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
954 GET_RE_DEBUG_FLAGS_DECL;
955
956 /*
957 print out the table precompression so that we can do a visual check
958 that they are identical.
959 */
960
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
962
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 965 if ( tmp ) {
ab3bbdeb
YO
966 PerlIO_printf( Perl_debug_log, "%*s",
967 colwidth,
ddc5bc0f 968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
972 )
973 );
3dab1dad
YO
974 }
975 }
976
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
978
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
981 }
982
983 PerlIO_printf( Perl_debug_log, "\n" );
984
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
986
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
990
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
993 if (v)
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
995 else
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
997 }
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1000 } else {
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1003 }
1004 }
07be1b83 1005}
3dab1dad
YO
1006
1007#endif
1008
786e8c11
YO
1009/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
3dab1dad 1019
786e8c11 1020Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1021
786e8c11
YO
1022A trie is an N'ary tree where the branches are determined by digital
1023decomposition of the key. IE, at the root node you look up the 1st character and
1024follow that branch repeat until you find the end of the branches. Nodes can be
1025marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1026
786e8c11 1027 /he|she|his|hers/
72f13be8 1028
786e8c11
YO
1029would convert into the following structure. Numbers represent states, letters
1030following numbers represent valid transitions on the letter from that state, if
1031the number is in square brackets it represents an accepting state, otherwise it
1032will be in parenthesis.
07be1b83 1033
786e8c11
YO
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1035 | |
1036 | (2)
1037 | |
1038 (1) +-i->(6)-+-s->[7]
1039 |
1040 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1041
786e8c11
YO
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1043
1044This shows that when matching against the string 'hers' we will begin at state 1
1045read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048single traverse. We store a mapping from accepting to state to which word was
1049matched, and then when we have multiple possibilities we try to complete the
1050rest of the regex in the order in which they occured in the alternation.
1051
1052The only prior NFA like behaviour that would be changed by the TRIE support is
1053the silent ignoring of duplicate alternations which are of the form:
1054
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1056
1057Thus EVAL blocks follwing a trie may be called a different number of times with
1058and without the optimisation. With the optimisations dupes will be silently
1059ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060the following demonstrates:
1061
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1063
1064which prints out 'word' three times, but
1065
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1067
1068which doesnt print it out at all. This is due to other optimisations kicking in.
1069
1070Example of what happens on a structural level:
1071
1072The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1073
1074 1: CURLYM[1] {1,32767}(18)
1075 5: BRANCH(8)
1076 6: EXACT <ac>(16)
1077 8: BRANCH(11)
1078 9: EXACT <ad>(16)
1079 11: BRANCH(14)
1080 12: EXACT <ab>(16)
1081 16: SUCCEED(0)
1082 17: NOTHING(18)
1083 18: END(0)
1084
1085This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086and should turn into:
1087
1088 1: CURLYM[1] {1,32767}(18)
1089 5: TRIE(16)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1091 <ac>
1092 <ad>
1093 <ab>
1094 16: SUCCEED(0)
1095 17: NOTHING(18)
1096 18: END(0)
1097
1098Cases where tail != last would be like /(?foo|bar)baz/:
1099
1100 1: BRANCH(4)
1101 2: EXACT <foo>(8)
1102 4: BRANCH(7)
1103 5: EXACT <bar>(8)
1104 7: TAIL(8)
1105 8: EXACT <baz>(10)
1106 10: END(0)
1107
1108which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109and would end up looking like:
1110
1111 1: TRIE(8)
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1113 <foo>
1114 <bar>
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1120
1121is the recommended Unicode-aware way of saying
1122
1123 *(d++) = uv;
1124*/
1125
1e2e3d02 1126#define TRIE_STORE_REVCHAR \
786e8c11 1127 STMT_START { \
1e2e3d02 1128 SV *tmp = newSVpvs(""); \
786e8c11 1129 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1132 } STMT_END
1133
1134#define TRIE_READ_CHAR STMT_START { \
1135 wordlen++; \
1136 if ( UTF ) { \
1137 if ( folder ) { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1140 foldlen -= len; \
1141 scan += len; \
1142 len = 0; \
1143 } else { \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1148 } \
1149 } else { \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1151 } \
1152 } else { \
1153 uvc = (U32)*uc; \
1154 len = 1; \
1155 } \
1156} STMT_END
1157
1158
1159
1160#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1164 } \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1168} STMT_END
07be1b83 1169
786e8c11
YO
1170#define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1180 \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1183 DEBUG_r({ \
1184 /* store the word for dumping */ \
1185 SV* tmp; \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1188 else \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1192 }); \
1193 \
1194 curword++; \
1195 \
1196 if ( noper_next < tail ) { \
1197 if (!trie->jump) \
1198 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1199 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1200 if (!jumper) \
1201 jumper = noper_next; \
1202 if (!nextbranch) \
1203 nextbranch= regnext(cur); \
1204 } \
1205 \
1206 if ( dupe ) { \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1216 } else { \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1219 } \
1220} STMT_END
07be1b83 1221
3dab1dad 1222
786e8c11
YO
1223#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1230 )
3dab1dad 1231
786e8c11
YO
1232#define MADE_TRIE 1
1233#define MADE_JUMP_TRIE 2
1234#define MADE_EXACT_TRIE 4
3dab1dad 1235
a3621e74 1236STATIC I32
786e8c11 1237S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1238{
27da23d5 1239 dVAR;
a3621e74
YO
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1242 regnode *cur;
9f7f3913 1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1244 STRLEN len = 0;
1245 UV uvc = 0;
1246 U16 curword = 0;
1247 U32 next_alloc = 0;
786e8c11
YO
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
7f69552c 1250 regnode *convert = NULL;
a3621e74 1251 /* we just use folder as a flag in utf8 */
e1ec3a88 1252 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1253 ? PL_fold
1254 : ( flags == EXACTFL
1255 ? PL_fold_locale
1256 : NULL
1257 )
1258 );
1259
e1ec3a88 1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1261 SV *re_trie_maxbuff;
3dab1dad
YO
1262#ifndef DEBUGGING
1263 /* these are only used during construction but are useful during
8e11feef 1264 * debugging so we store them in the struct when debugging.
8e11feef 1265 */
3dab1dad 1266 STRLEN trie_charcount=0;
3dab1dad
YO
1267 AV *trie_revcharmap;
1268#endif
a3621e74 1269 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1270#ifndef DEBUGGING
1271 PERL_UNUSED_ARG(depth);
1272#endif
a3621e74 1273
a02a5408 1274 Newxz( trie, 1, reg_trie_data );
a3621e74 1275 trie->refcount = 1;
3dab1dad 1276 trie->startstate = 1;
786e8c11 1277 trie->wordcount = word_count;
a3621e74 1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1279 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1282 DEBUG_r({
1283 trie->words = newAV();
a3621e74 1284 });
3dab1dad 1285 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1286
0111c4fd 1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1288 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1290 }
3dab1dad
YO
1291 DEBUG_OPTIMISE_r({
1292 PerlIO_printf( Perl_debug_log,
786e8c11 1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1297 (int)depth);
3dab1dad 1298 });
7f69552c
YO
1299
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1303 convert = first;
1304 } else {
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1307 }
1308
a3621e74
YO
1309 /* -- First loop and Setup --
1310
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1314 have unique chars.
1315
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1319 coded index.
1320
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1329
1330 */
1331
a3621e74 1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1333 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1334 const U8 *uc = (U8*)STRING( noper );
a28509cc 1335 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1336 STRLEN foldlen = 0;
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1338 const U8 *scan = (U8*)NULL;
07be1b83 1339 U32 wordlen = 0; /* required init */
3dab1dad 1340 STRLEN chars=0;
a3621e74 1341
3dab1dad
YO
1342 if (OP(noper) == NOTHING) {
1343 trie->minlen= 0;
1344 continue;
1345 }
1346 if (trie->bitmap) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1349 }
a3621e74 1350 for ( ; uc < e ; uc += len ) {
3dab1dad 1351 TRIE_CHARCOUNT(trie)++;
a3621e74 1352 TRIE_READ_CHAR;
3dab1dad 1353 chars++;
a3621e74
YO
1354 if ( uvc < 256 ) {
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1357 if ( folder )
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1359 TRIE_STORE_REVCHAR;
a3621e74
YO
1360 }
1361 } else {
1362 SV** svpp;
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1365
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1367
1368 if ( !svpp )
e4584336 1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1370
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1373 TRIE_STORE_REVCHAR;
a3621e74
YO
1374 }
1375 }
1376 }
3dab1dad
YO
1377 if( cur == first ) {
1378 trie->minlen=chars;
1379 trie->maxlen=chars;
1380 } else if (chars < trie->minlen) {
1381 trie->minlen=chars;
1382 } else if (chars > trie->maxlen) {
1383 trie->maxlen=chars;
1384 }
1385
a3621e74
YO
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
85c3142d 1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1393 );
786e8c11 1394 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1395
1396 /*
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1401 conservative but potentially much slower representation using an array
1402 of lists.
1403
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1410
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1414
1415 */
1416
1417
3dab1dad 1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1419 /*
1420 Second Pass -- Array Of Lists Representation
1421
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1425
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
a3621e74
YO
1429 */
1430
a3621e74
YO
1431 STRLEN transcount = 1;
1432
1e2e3d02
YO
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1436
3dab1dad 1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1438 TRIE_LIST_NEW(1);
1439 next_alloc = 2;
1440
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1442
c445ea15
AL
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
07be1b83 1450 U32 wordlen = 0; /* required init */
c445ea15
AL
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1452
3dab1dad 1453 if (OP(noper) != NOTHING) {
786e8c11 1454 for ( ; uc < e ; uc += len ) {
c445ea15 1455
786e8c11 1456 TRIE_READ_CHAR;
c445ea15 1457
786e8c11
YO
1458 if ( uvc < 256 ) {
1459 charid = trie->charmap[ uvc ];
c445ea15 1460 } else {
786e8c11
YO
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1462 if ( !svpp ) {
1463 charid = 0;
1464 } else {
1465 charid=(U16)SvIV( *svpp );
1466 }
c445ea15 1467 }
786e8c11
YO
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1469 if ( charid ) {
a3621e74 1470
786e8c11
YO
1471 U16 check;
1472 U32 newstate = 0;
a3621e74 1473
786e8c11
YO
1474 charid--;
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
c445ea15 1477 }
786e8c11
YO
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1481 break;
1482 }
1483 }
1484 if ( ! newstate ) {
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1487 transcount++;
1488 }
1489 state = newstate;
1490 } else {
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1492 }
a28509cc 1493 }
c445ea15 1494 }
3dab1dad 1495 TRIE_HANDLE_WORD(state);
a3621e74
YO
1496
1497 } /* end second pass */
1498
1e2e3d02
YO
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
a3621e74
YO
1501 Renew( trie->states, next_alloc, reg_trie_state );
1502
3dab1dad
YO
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1506 );
a3621e74 1507
a02a5408 1508 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1509 {
1510 U32 state;
a3621e74
YO
1511 U32 tp = 0;
1512 U32 zp = 0;
1513
1514
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1516 U32 base=0;
1517
1518 /*
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1521 );
1522 */
1523
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1526 U16 maxid=minid;
a28509cc 1527 U16 idx;
a3621e74
YO
1528
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1532 minid=forid;
1533 } else if ( forid > maxid ) {
1534 maxid=forid;
1535 }
a3621e74
YO
1536 }
1537 if ( transcount < tp + maxid - minid + 1) {
1538 transcount *= 2;
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1541 }
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1544 U32 set = 0;
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1550 set = 1;
1551 break;
1552 }
1553 }
1554 if ( !set ) {
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1557 tp++;
1558 zp = tp;
1559 }
1560 } else {
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1565 }
1566 tp += ( maxid - minid + 1 );
1567 }
1568 Safefree(trie->states[ state ].trans.list);
1569 }
1570 /*
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1573 );
1574 */
1575 trie->states[ state ].trans.base=base;
1576 }
cc601c31 1577 trie->lasttrans = tp + 1;
a3621e74
YO
1578 }
1579 } else {
1580 /*
1581 Second Pass -- Flat Table Representation.
1582
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1587
1588 We then construct the trie using only the .next slots of the entry
1589 structs.
1590
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1594
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1596 transition.
1597
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1603
1604 A B A B
1605 1. 2 4 1. 3 7
1606 2. 0 3 3. 0 5
1607 3. 0 0 5. 0 0
1608 4. 0 0 7. 0 0
1609
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1613
1614 */
1e2e3d02
YO
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
3dab1dad
YO
1618
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1620 reg_trie_trans );
3dab1dad 1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1622 next_alloc = trie->uniquecharcount + 1;
1623
3dab1dad 1624
a3621e74
YO
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1626
c445ea15 1627 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1630
1631 U32 state = 1; /* required init */
1632
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1636
1637 STRLEN foldlen = 0; /* required init */
07be1b83 1638 U32 wordlen = 0; /* required init */
a3621e74
YO
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1640
3dab1dad 1641 if ( OP(noper) != NOTHING ) {
786e8c11 1642 for ( ; uc < e ; uc += len ) {
a3621e74 1643
786e8c11 1644 TRIE_READ_CHAR;
a3621e74 1645
786e8c11
YO
1646 if ( uvc < 256 ) {
1647 charid = trie->charmap[ uvc ];
1648 } else {
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1651 }
786e8c11
YO
1652 if ( charid ) {
1653 charid--;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1658 }
1659 state = trie->trans[ state + charid ].next;
1660 } else {
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1662 }
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1664 }
a3621e74 1665 }
3dab1dad
YO
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1668
1669 } /* end second pass */
1670
3dab1dad
YO
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1674 );
a3621e74 1675
a3621e74
YO
1676 {
1677 /*
1678 * Inplace compress the table.*
1679
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1683
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1686
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1689
1690 - If .base is 0 there are no valid transitions from that node.
1691
1692 - If .base is nonzero then charid is added to it to find an entry in
1693 the trans array.
1694
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1700 valid.
1701
786e8c11 1702 XXX - wrong maybe?
a3621e74
YO
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1708 0.
1709
1710 - We set 'pos' to indicate the first entry of the second node.
1711
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1718 compression.
1719
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1722
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
786e8c11 1728 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1729
1730 - Once compressed we can Renew/realloc the structures to release the
1731 excess space.
1732
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1735
1736 demq
1737 */
a3b680e6 1738 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1739 U32 state, charid;
a3621e74 1740 U32 pos = 0, zp=0;
1e2e3d02 1741 trie->statecount = laststate;
a3621e74
YO
1742
1743 for ( state = 1 ; state < laststate ; state++ ) {
1744 U8 flag = 0;
a28509cc
AL
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1748 trie->trans[ stateidx ].check = 0;
1749
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1753 if (o_used == 1) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1756 break;
1757 }
1758 }
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1763 break;
1764 }
1765 used--;
1766 }
1767 if ( !flag ) {
1768 flag = 1;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1770 }
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1773 pos++;
1774 }
1775 }
1776 }
cc601c31 1777 trie->lasttrans = pos + 1;
1e2e3d02 1778 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1779 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1780 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1784 (IV)next_alloc,
1785 (IV)pos,
a3621e74
YO
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1787 );
1788
1789 } /* end table compress */
1790 }
1e2e3d02
YO
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1796 );
cc601c31
YO
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1799
3dab1dad
YO
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1803 );
07be1b83 1804
3dab1dad 1805 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1806 U8 nodetype =(U8)(flags & 0xFF);
1807 char *str=NULL;
786e8c11 1808
07be1b83 1809#ifdef DEBUGGING
e62cc96a 1810 regnode *optimize = NULL;
b57a0404
JH
1811 U32 mjd_offset = 0;
1812 U32 mjd_nodelen = 0;
07be1b83 1813#endif
a3621e74 1814 /*
3dab1dad
YO
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
a3621e74 1821 */
3dab1dad 1822 /* Find the node we are going to overwrite */
7f69552c 1823 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1824 /* branch sub-chain */
3dab1dad 1825 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1826 DEBUG_r({
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1829 });
7f69552c
YO
1830 /* whole branch chain */
1831 } else {
1832 DEBUG_r({
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1836 });
07be1b83 1837 }
7f69552c 1838
07be1b83
YO
1839 DEBUG_OPTIMISE_r(
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
786e8c11 1842 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1843 );
a3621e74 1844
3dab1dad
YO
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
786e8c11 1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1849 U32 state;
1e2e3d02 1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1851 U32 ofs = 0;
8e11feef
RGS
1852 I32 idx = -1;
1853 U32 count = 0;
1854 const U32 base = trie->states[ state ].trans.base;
a3621e74 1855
3dab1dad 1856 if ( trie->states[state].wordnum )
8e11feef 1857 count = 1;
a3621e74 1858
8e11feef 1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1863 {
3dab1dad 1864 if ( ++count > 1 ) {
8e11feef 1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1867 if ( state == 1 ) break;
3dab1dad
YO
1868 if ( count == 2 ) {
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1870 DEBUG_OPTIMISE_r(
8e11feef
RGS
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
786e8c11 1874 (UV)state));
be8e71aa
YO
1875 if (idx >= 0) {
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1878
3dab1dad 1879 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1880 if ( folder )
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1882 DEBUG_OPTIMISE_r(
07be1b83 1883 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1884 );
8e11feef
RGS
1885 }
1886 }
1887 TRIE_BITMAP_SET(trie,*ch);
1888 if ( folder )
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1891 }
1892 idx = ofs;
1893 }
3dab1dad
YO
1894 }
1895 if ( count == 1 ) {
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1897 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1898 DEBUG_OPTIMISE_r(
8e11feef
RGS
1899 PerlIO_printf( Perl_debug_log,
1900 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1901 (int)depth * 2 + 2, "",
786e8c11 1902 (UV)state, (UV)idx, ch)
3dab1dad
YO
1903 );
1904 if ( state==1 ) {
1905 OP( convert ) = nodetype;
1906 str=STRING(convert);
1907 STR_LEN(convert)=0;
1908 }
1909 *str++=*ch;
1910 STR_LEN(convert)++;
a3621e74 1911
8e11feef 1912 } else {
f9049ba1 1913#ifdef DEBUGGING
8e11feef
RGS
1914 if (state>1)
1915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1916#endif
8e11feef
RGS
1917 break;
1918 }
1919 }
3dab1dad 1920 if (str) {
8e11feef 1921 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1922 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1923 trie->startstate = state;
07be1b83
YO
1924 trie->minlen -= (state - 1);
1925 trie->maxlen -= (state - 1);
1926 DEBUG_r({
1927 regnode *fix = convert;
1928 mjd_nodelen++;
1929 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1930 while( ++fix < n ) {
1931 Set_Node_Offset_Length(fix, 0, 0);
1932 }
1933 });
8e11feef
RGS
1934 if (trie->maxlen) {
1935 convert = n;
1936 } else {
3dab1dad 1937 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1938 DEBUG_r(optimize= n);
3dab1dad
YO
1939 }
1940 }
1941 }
a5ca303d
YO
1942 if (!jumper)
1943 jumper = last;
3dab1dad 1944 if ( trie->maxlen ) {
8e11feef
RGS
1945 NEXT_OFF( convert ) = (U16)(tail - convert);
1946 ARG_SET( convert, data_slot );
786e8c11
YO
1947 /* Store the offset to the first unabsorbed branch in
1948 jump[0], which is otherwise unused by the jump logic.
1949 We use this when dumping a trie and during optimisation. */
1950 if (trie->jump)
7f69552c 1951 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1952
786e8c11
YO
1953 /* XXXX */
1954 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1955 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1956 {
1957 OP( convert ) = TRIEC;
1958 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1959 Safefree(trie->bitmap);
1960 trie->bitmap= NULL;
1961 } else
1962 OP( convert ) = TRIE;
a3621e74 1963
3dab1dad
YO
1964 /* store the type in the flags */
1965 convert->flags = nodetype;
a5ca303d
YO
1966 DEBUG_r({
1967 optimize = convert
1968 + NODE_STEP_REGNODE
1969 + regarglen[ OP( convert ) ];
1970 });
1971 /* XXX We really should free up the resource in trie now,
1972 as we won't use them - (which resources?) dmq */
3dab1dad 1973 }
a3621e74 1974 /* needed for dumping*/
e62cc96a 1975 DEBUG_r(if (optimize) {
07be1b83 1976 regnode *opt = convert;
e62cc96a 1977 while ( ++opt < optimize) {
07be1b83
YO
1978 Set_Node_Offset_Length(opt,0,0);
1979 }
786e8c11
YO
1980 /*
1981 Try to clean up some of the debris left after the
1982 optimisation.
a3621e74 1983 */
786e8c11 1984 while( optimize < jumper ) {
07be1b83 1985 mjd_nodelen += Node_Length((optimize));
a3621e74 1986 OP( optimize ) = OPTIMIZED;
07be1b83 1987 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1988 optimize++;
1989 }
07be1b83 1990 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1991 });
1992 } /* end node insert */
07be1b83 1993#ifndef DEBUGGING
6e8b4190 1994 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1995#endif
786e8c11
YO
1996 return trie->jump
1997 ? MADE_JUMP_TRIE
1998 : trie->startstate>1
1999 ? MADE_EXACT_TRIE
2000 : MADE_TRIE;
2001}
2002
2003STATIC void
2004S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2005{
2006/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2007
2008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2009 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2010 ISBN 0-201-10088-6
2011
2012 We find the fail state for each state in the trie, this state is the longest proper
2013 suffix of the current states 'word' that is also a proper prefix of another word in our
2014 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2015 the DFA not to have to restart after its tried and failed a word at a given point, it
2016 simply continues as though it had been matching the other word in the first place.
2017 Consider
2018 'abcdgu'=~/abcdefg|cdgu/
2019 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2020 fail, which would bring use to the state representing 'd' in the second word where we would
2021 try 'g' and succeed, prodceding to match 'cdgu'.
2022 */
2023 /* add a fail transition */
2024 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2025 U32 *q;
2026 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2027 const U32 numstates = trie->statecount;
786e8c11
YO
2028 const U32 ubound = trie->lasttrans + ucharcount;
2029 U32 q_read = 0;
2030 U32 q_write = 0;
2031 U32 charid;
2032 U32 base = trie->states[ 1 ].trans.base;
2033 U32 *fail;
2034 reg_ac_data *aho;
2035 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2036 GET_RE_DEBUG_FLAGS_DECL;
2037#ifndef DEBUGGING
2038 PERL_UNUSED_ARG(depth);
2039#endif
2040
2041
2042 ARG_SET( stclass, data_slot );
2043 Newxz( aho, 1, reg_ac_data );
2044 RExC_rx->data->data[ data_slot ] = (void*)aho;
2045 aho->trie=trie;
2046 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2047 numstates * sizeof(reg_trie_state));
786e8c11
YO
2048 Newxz( q, numstates, U32);
2049 Newxz( aho->fail, numstates, U32 );
2050 aho->refcount = 1;
2051 fail = aho->fail;
2052 /* initialize fail[0..1] to be 1 so that we always have
2053 a valid final fail state */
2054 fail[ 0 ] = fail[ 1 ] = 1;
2055
2056 for ( charid = 0; charid < ucharcount ; charid++ ) {
2057 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2058 if ( newstate ) {
2059 q[ q_write ] = newstate;
2060 /* set to point at the root */
2061 fail[ q[ q_write++ ] ]=1;
2062 }
2063 }
2064 while ( q_read < q_write) {
2065 const U32 cur = q[ q_read++ % numstates ];
2066 base = trie->states[ cur ].trans.base;
2067
2068 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2069 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2070 if (ch_state) {
2071 U32 fail_state = cur;
2072 U32 fail_base;
2073 do {
2074 fail_state = fail[ fail_state ];
2075 fail_base = aho->states[ fail_state ].trans.base;
2076 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2077
2078 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2079 fail[ ch_state ] = fail_state;
2080 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2081 {
2082 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2083 }
2084 q[ q_write++ % numstates] = ch_state;
2085 }
2086 }
2087 }
2088 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2089 when we fail in state 1, this allows us to use the
2090 charclass scan to find a valid start char. This is based on the principle
2091 that theres a good chance the string being searched contains lots of stuff
2092 that cant be a start char.
2093 */
2094 fail[ 0 ] = fail[ 1 ] = 0;
2095 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2096 PerlIO_printf(Perl_debug_log,
2097 "%*sStclass Failtable (%"UVuf" states): 0",
2098 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2099 );
786e8c11
YO
2100 for( q_read=1; q_read<numstates; q_read++ ) {
2101 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2102 }
2103 PerlIO_printf(Perl_debug_log, "\n");
2104 });
2105 Safefree(q);
2106 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2107}
2108
786e8c11 2109
a3621e74 2110/*
5d1c421c
JH
2111 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2112 * These need to be revisited when a newer toolchain becomes available.
2113 */
2114#if defined(__sparc64__) && defined(__GNUC__)
2115# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2116# undef SPARC64_GCC_WORKAROUND
2117# define SPARC64_GCC_WORKAROUND 1
2118# endif
2119#endif
2120
07be1b83 2121#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2122 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2123 SV * const mysv=sv_newmortal(); \
2124 regnode *Next = regnext(scan); \
2125 regprop(RExC_rx, mysv, scan); \
7f69552c 2126 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2127 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2128 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2129 }});
07be1b83 2130
1de06328
YO
2131
2132
2133
2134
07be1b83
YO
2135#define JOIN_EXACT(scan,min,flags) \
2136 if (PL_regkind[OP(scan)] == EXACT) \
2137 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2138
be8e71aa 2139STATIC U32
07be1b83
YO
2140S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2141 /* Merge several consecutive EXACTish nodes into one. */
2142 regnode *n = regnext(scan);
2143 U32 stringok = 1;
2144 regnode *next = scan + NODE_SZ_STR(scan);
2145 U32 merged = 0;
2146 U32 stopnow = 0;
2147#ifdef DEBUGGING
2148 regnode *stop = scan;
72f13be8 2149 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2150#else
d47053eb
RGS
2151 PERL_UNUSED_ARG(depth);
2152#endif
2153#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2154 PERL_UNUSED_ARG(flags);
2155 PERL_UNUSED_ARG(val);
07be1b83 2156#endif
07be1b83
YO
2157 DEBUG_PEEP("join",scan,depth);
2158
2159 /* Skip NOTHING, merge EXACT*. */
2160 while (n &&
2161 ( PL_regkind[OP(n)] == NOTHING ||
2162 (stringok && (OP(n) == OP(scan))))
2163 && NEXT_OFF(n)
2164 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2165
2166 if (OP(n) == TAIL || n > next)
2167 stringok = 0;
2168 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2169 DEBUG_PEEP("skip:",n,depth);
2170 NEXT_OFF(scan) += NEXT_OFF(n);
2171 next = n + NODE_STEP_REGNODE;
2172#ifdef DEBUGGING
2173 if (stringok)
2174 stop = n;
2175#endif
2176 n = regnext(n);
2177 }
2178 else if (stringok) {
786e8c11 2179 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2180 regnode * const nnext = regnext(n);
2181
2182 DEBUG_PEEP("merg",n,depth);
2183
2184 merged++;
2185 if (oldl + STR_LEN(n) > U8_MAX)
2186 break;
2187 NEXT_OFF(scan) += NEXT_OFF(n);
2188 STR_LEN(scan) += STR_LEN(n);
2189 next = n + NODE_SZ_STR(n);
2190 /* Now we can overwrite *n : */
2191 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2192#ifdef DEBUGGING
2193 stop = next - 1;
2194#endif
2195 n = nnext;
2196 if (stopnow) break;
2197 }
2198
d47053eb
RGS
2199#ifdef EXPERIMENTAL_INPLACESCAN
2200 if (flags && !NEXT_OFF(n)) {
2201 DEBUG_PEEP("atch", val, depth);
2202 if (reg_off_by_arg[OP(n)]) {
2203 ARG_SET(n, val - n);
2204 }
2205 else {
2206 NEXT_OFF(n) = val - n;
2207 }
2208 stopnow = 1;
2209 }
07be1b83
YO
2210#endif
2211 }
2212
2213 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2214 /*
2215 Two problematic code points in Unicode casefolding of EXACT nodes:
2216
2217 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2218 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2219
2220 which casefold to
2221
2222 Unicode UTF-8
2223
2224 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2225 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2226
2227 This means that in case-insensitive matching (or "loose matching",
2228 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2229 length of the above casefolded versions) can match a target string
2230 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2231 This would rather mess up the minimum length computation.
2232
2233 What we'll do is to look for the tail four bytes, and then peek
2234 at the preceding two bytes to see whether we need to decrease
2235 the minimum length by four (six minus two).
2236
2237 Thanks to the design of UTF-8, there cannot be false matches:
2238 A sequence of valid UTF-8 bytes cannot be a subsequence of
2239 another valid sequence of UTF-8 bytes.
2240
2241 */
2242 char * const s0 = STRING(scan), *s, *t;
2243 char * const s1 = s0 + STR_LEN(scan) - 1;
2244 char * const s2 = s1 - 4;
e294cc5d
JH
2245#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2246 const char t0[] = "\xaf\x49\xaf\x42";
2247#else
07be1b83 2248 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2249#endif
07be1b83
YO
2250 const char * const t1 = t0 + 3;
2251
2252 for (s = s0 + 2;
2253 s < s2 && (t = ninstr(s, s1, t0, t1));
2254 s = t + 4) {
e294cc5d
JH
2255#ifdef EBCDIC
2256 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2257 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2258#else
07be1b83
YO
2259 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2260 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2261#endif
07be1b83
YO
2262 *min -= 4;
2263 }
2264 }
2265
2266#ifdef DEBUGGING
2267 /* Allow dumping */
2268 n = scan + NODE_SZ_STR(scan);
2269 while (n <= stop) {
2270 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2271 OP(n) = OPTIMIZED;
2272 NEXT_OFF(n) = 0;
2273 }
2274 n++;
2275 }
2276#endif
2277 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2278 return stopnow;
2279}
2280
653099ff
GS
2281/* REx optimizer. Converts nodes into quickier variants "in place".
2282 Finds fixed substrings. */
2283
a0288114 2284/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2285 to the position after last scanned or to NULL. */
2286
40d049e4
YO
2287#define INIT_AND_WITHP \
2288 assert(!and_withp); \
2289 Newx(and_withp,1,struct regnode_charclass_class); \
2290 SAVEFREEPV(and_withp)
07be1b83 2291
b515a41d
YO
2292/* this is a chain of data about sub patterns we are processing that
2293 need to be handled seperately/specially in study_chunk. Its so
2294 we can simulate recursion without losing state. */
2295struct scan_frame;
2296typedef struct scan_frame {
2297 regnode *last; /* last node to process in this frame */
2298 regnode *next; /* next node to process when last is reached */
2299 struct scan_frame *prev; /*previous frame*/
2300 I32 stop; /* what stopparen do we use */
2301} scan_frame;
2302
76e3520e 2303STATIC I32
40d049e4 2304S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2305 I32 *minlenp, I32 *deltap,
40d049e4
YO
2306 regnode *last,
2307 scan_data_t *data,
2308 I32 stopparen,
2309 U8* recursed,
2310 struct regnode_charclass_class *and_withp,
2311 U32 flags, U32 depth)
c277df42
IZ
2312 /* scanp: Start here (read-write). */
2313 /* deltap: Write maxlen-minlen here. */
2314 /* last: Stop before this one. */
40d049e4
YO
2315 /* data: string data about the pattern */
2316 /* stopparen: treat close N as END */
2317 /* recursed: which subroutines have we recursed into */
2318 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2319{
97aff369 2320 dVAR;
c277df42
IZ
2321 I32 min = 0, pars = 0, code;
2322 regnode *scan = *scanp, *next;
2323 I32 delta = 0;
2324 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2325 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2326 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2327 scan_data_t data_fake;
a3621e74 2328 SV *re_trie_maxbuff = NULL;
786e8c11 2329 regnode *first_non_open = scan;
e2e6a0f1 2330 I32 stopmin = I32_MAX;
b515a41d
YO
2331 scan_frame last_frame= { last, NULL, NULL, stopparen };
2332 scan_frame *frame=&last_frame;
2333
a3621e74 2334 GET_RE_DEBUG_FLAGS_DECL;
b515a41d 2335
13a24bad 2336#ifdef DEBUGGING
40d049e4 2337 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2338#endif
40d049e4 2339
786e8c11 2340 if ( depth == 0 ) {
40d049e4 2341 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2342 first_non_open=regnext(first_non_open);
2343 }
2344
b515a41d
YO
2345 while (frame) {
2346
2347 DEBUG_PEEP("FBEG",scan,depth);
2348 while ( scan && OP(scan) != END && scan < frame->last ) {
2349 /* Peephole optimizer: */
2350 DEBUG_STUDYDATA(data,depth);
2351 DEBUG_PEEP("Peep",scan,depth);
2352 JOIN_EXACT(scan,&min,0);
2353
2354 /* Follow the next-chain of the current node and optimize
2355 away all the NOTHINGs from it. */
2356 if (OP(scan) != CURLYX) {
2357 const int max = (reg_off_by_arg[OP(scan)]
2358 ? I32_MAX
2359 /* I32 may be smaller than U16 on CRAYs! */
2360 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2361 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2362 int noff;
2363 regnode *n = scan;
2364
2365 /* Skip NOTHING and LONGJMP. */
2366 while ((n = regnext(n))
2367 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2368 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2369 && off + noff < max)
2370 off += noff;
2371 if (reg_off_by_arg[OP(scan)])
2372 ARG(scan) = off;
2373 else
2374 NEXT_OFF(scan) = off;
2375 }
b81d288d 2376
b515a41d
YO
2377 /* The principal pseudo-switch. Cannot be a switch, since we
2378 look into several different things. */
2379 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2380 || OP(scan) == IFTHEN) {
2381 next = regnext(scan);
2382 code = OP(scan);
2383 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2384
2385 if (OP(next) == code || code == IFTHEN) {
2386 /* NOTE - There is similar code to this block below for handling
2387 TRIE nodes on a re-study. If you change stuff here check there
2388 too. */
2389 I32 max1 = 0, min1 = I32_MAX, num = 0;
2390 struct regnode_charclass_class accum;
2391 regnode * const startbranch=scan;
2392
2393 if (flags & SCF_DO_SUBSTR)
2394 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2395 if (flags & SCF_DO_STCLASS)
2396 cl_init_zero(pRExC_state, &accum);
a3621e74 2397
b515a41d
YO
2398 while (OP(scan) == code) {
2399 I32 deltanext, minnext, f = 0, fake;
2400 struct regnode_charclass_class this_class;
c277df42 2401
b515a41d
YO
2402 num++;
2403 data_fake.flags = 0;
2404 if (data) {
2405 data_fake.whilem_c = data->whilem_c;
2406 data_fake.last_closep = data->last_closep;
2407 }
2408 else
2409 data_fake.last_closep = &fake;
2410 next = regnext(scan);
c277df42 2411 scan = NEXTOPER(scan);
b515a41d
YO
2412 if (code != BRANCH)
2413 scan = NEXTOPER(scan);
2414 if (flags & SCF_DO_STCLASS) {
2415 cl_init(pRExC_state, &this_class);
2416 data_fake.start_class = &this_class;
2417 f = SCF_DO_STCLASS_AND;
2418 }
2419 if (flags & SCF_WHILEM_VISITED_POS)
2420 f |= SCF_WHILEM_VISITED_POS;
2421
2422 /* we suppose the run is continuous, last=next...*/
2423 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2424 next, &data_fake,
2425 stopparen, recursed, NULL, f,depth+1);
2426 if (min1 > minnext)
2427 min1 = minnext;
2428 if (max1 < minnext + deltanext)
2429 max1 = minnext + deltanext;
2430 if (deltanext == I32_MAX)
2431 is_inf = is_inf_internal = 1;
2432 scan = next;
2433 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2434 pars++;
2435 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2436 if ( stopmin > minnext)
2437 stopmin = min + min1;
2438 flags &= ~SCF_DO_SUBSTR;
2439 if (data)
2440 data->flags |= SCF_SEEN_ACCEPT;
2441 }
2442 if (data) {
2443 if (data_fake.flags & SF_HAS_EVAL)
2444 data->flags |= SF_HAS_EVAL;
2445 data->whilem_c = data_fake.whilem_c;
2446 }
2447 if (flags & SCF_DO_STCLASS)
2448 cl_or(pRExC_state, &accum, &this_class);
3dab1dad 2449 }
b515a41d
YO
2450 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2451 min1 = 0;
2452 if (flags & SCF_DO_SUBSTR) {
2453 data->pos_min += min1;
2454 data->pos_delta += max1 - min1;
2455 if (max1 != min1 || is_inf)
2456 data->longest = &(data->longest_float);
653099ff 2457 }
b515a41d
YO
2458 min += min1;
2459 delta += max1 - min1;
2460 if (flags & SCF_DO_STCLASS_OR) {
2461 cl_or(pRExC_state, data->start_class, &accum);
2462 if (min1) {
2463 cl_and(data->start_class, and_withp);
2464 flags &= ~SCF_DO_STCLASS;
2465 }
de0c8cb8 2466 }
b515a41d
YO
2467 else if (flags & SCF_DO_STCLASS_AND) {
2468 if (min1) {
2469 cl_and(data->start_class, &accum);
2470 flags &= ~SCF_DO_STCLASS;
2471 }
2472 else {
2473 /* Switch to OR mode: cache the old value of
2474 * data->start_class */
2475 INIT_AND_WITHP;
2476 StructCopy(data->start_class, and_withp,
2477 struct regnode_charclass_class);
2478 flags &= ~SCF_DO_STCLASS_AND;
2479 StructCopy(&accum, data->start_class,
2480 struct regnode_charclass_class);
2481 flags |= SCF_DO_STCLASS_OR;
2482 data->start_class->flags |= ANYOF_EOS;
2483 }
de0c8cb8 2484 }
a3621e74 2485
b515a41d
YO
2486 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2487 /* demq.
a3621e74 2488
b515a41d
YO
2489 Assuming this was/is a branch we are dealing with: 'scan' now
2490 points at the item that follows the branch sequence, whatever
2491 it is. We now start at the beginning of the sequence and look
2492 for subsequences of
a3621e74 2493
b515a41d
YO
2494 BRANCH->EXACT=>x1
2495 BRANCH->EXACT=>x2
2496 tail
a3621e74 2497
b515a41d 2498 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2499
b515a41d
YO
2500 If we can find such a subseqence we need to turn the first
2501 element into a trie and then add the subsequent branch exact
2502 strings to the trie.
a3621e74 2503
b515a41d 2504 We have two cases
a3621e74 2505
b515a41d 2506 1. patterns where the whole set of branch can be converted.
a3621e74 2507
b515a41d 2508 2. patterns where only a subset can be converted.
a3621e74 2509
b515a41d
YO
2510 In case 1 we can replace the whole set with a single regop
2511 for the trie. In case 2 we need to keep the start and end
2512 branchs so
a3621e74 2513
b515a41d
YO
2514 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2515 becomes BRANCH TRIE; BRANCH X;
a3621e74 2516
b515a41d
YO
2517 There is an additional case, that being where there is a
2518 common prefix, which gets split out into an EXACT like node
2519 preceding the TRIE node.
786e8c11 2520
b515a41d
YO
2521 If x(1..n)==tail then we can do a simple trie, if not we make
2522 a "jump" trie, such that when we match the appropriate word
2523 we "jump" to the appopriate tail node. Essentailly we turn
2524 a nested if into a case structure of sorts.
a3621e74 2525
b515a41d
YO
2526 */
2527
2528 int made=0;
2529 if (!re_trie_maxbuff) {
2530 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2531 if (!SvIOK(re_trie_maxbuff))
2532 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2533 }
2534 if ( SvIV(re_trie_maxbuff)>=0 ) {
2535 regnode *cur;
2536 regnode *first = (regnode *)NULL;
2537 regnode *last = (regnode *)NULL;
2538 regnode *tail = scan;
2539 U8 optype = 0;
2540 U32 count=0;
a3621e74
YO
2541
2542#ifdef DEBUGGING
b515a41d 2543 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2544#endif
b515a41d
YO
2545 /* var tail is used because there may be a TAIL
2546 regop in the way. Ie, the exacts will point to the
2547 thing following the TAIL, but the last branch will
2548 point at the TAIL. So we advance tail. If we
2549 have nested (?:) we may have to move through several
2550 tails.
2551 */
2552
2553 while ( OP( tail ) == TAIL ) {
2554 /* this is the TAIL generated by (?:) */
2555 tail = regnext( tail );
2556 }
a3621e74 2557
a3621e74 2558
b515a41d
YO
2559 DEBUG_OPTIMISE_r({
2560 regprop(RExC_rx, mysv, tail );
2561 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2562 (int)depth * 2 + 2, "",
2563 "Looking for TRIE'able sequences. Tail node is: ",
2564 SvPV_nolen_const( mysv )
2565 );
2566 });
2567
2568 /*
2569
2570 step through the branches, cur represents each
2571 branch, noper is the first thing to be matched
2572 as part of that branch and noper_next is the
2573 regnext() of that node. if noper is an EXACT
2574 and noper_next is the same as scan (our current
2575 position in the regex) then the EXACT branch is
2576 a possible optimization target. Once we have
2577 two or more consequetive such branches we can
2578 create a trie of the EXACT's contents and stich
2579 it in place. If the sequence represents all of
2580 the branches we eliminate the whole thing and
2581 replace it with a single TRIE. If it is a
2582 subsequence then we need to stitch it in. This
2583 means the first branch has to remain, and needs
2584 to be repointed at the item on the branch chain
2585 following the last branch optimized. This could
2586 be either a BRANCH, in which case the
2587 subsequence is internal, or it could be the
2588 item following the branch sequence in which
2589 case the subsequence is at the end.
a3621e74 2590
b515a41d 2591*/
a3621e74 2592
b515a41d
YO
2593 /* dont use tail as the end marker for this traverse */
2594 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2595 regnode * const noper = NEXTOPER( cur );
2596#if defined(DEBUGGING) || defined(NOJUMPTRIE)
2597 regnode * const noper_next = regnext( noper );
2598#endif
2599
2600 DEBUG_OPTIMISE_r({
2601 regprop(RExC_rx, mysv, cur);
2602 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2603 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2604
2605 regprop(RExC_rx, mysv, noper);
2606 PerlIO_printf( Perl_debug_log, " -> %s",
2607 SvPV_nolen_const(mysv));
2608
2609 if ( noper_next ) {
2610 regprop(RExC_rx, mysv, noper_next );
2611 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2612 SvPV_nolen_const(mysv));
2613 }
2614 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2615 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2616 });
2617 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2618 : PL_regkind[ OP( noper ) ] == EXACT )
2619 || OP(noper) == NOTHING )
786e8c11 2620#ifdef NOJUMPTRIE
b515a41d 2621 && noper_next == tail
786e8c11 2622#endif
b515a41d
YO
2623 && count < U16_MAX)
2624 {
2625 count++;
2626 if ( !first || optype == NOTHING ) {
2627 if (!first) first = cur;
2628 optype = OP( noper );
2629 } else {
2630 last = cur;
2631 }
2632 } else {
2633 if ( last ) {
2634 make_trie( pRExC_state,
2635 startbranch, first, cur, tail, count,
2636 optype, depth+1 );
2637 }
2638 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2639#ifdef NOJUMPTRIE
b515a41d 2640 && noper_next == tail
786e8c11 2641#endif
b515a41d
YO
2642 ) {
2643 count = 1;
2644 first = cur;
2645 optype = OP( noper );
2646 } else {
2647 count = 0;
2648 first = NULL;
2649 optype = 0;
2650 }
2651 last = NULL;
2652 }
2653 }
2654 DEBUG_OPTIMISE_r({
2655 regprop(RExC_rx, mysv, cur);
2656 PerlIO_printf( Perl_debug_log,
2657 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2658 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2659
2660 });
2661 if ( last ) {
2662 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2663#ifdef TRIE_STUDY_OPT
b515a41d
YO
2664 if ( ((made == MADE_EXACT_TRIE &&
2665 startbranch == first)
2666 || ( first_non_open == first )) &&
2667 depth==0 ) {
2668 flags |= SCF_TRIE_RESTUDY;
2669 if ( startbranch == first
2670 && scan == tail )
2671 {
2672 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2673 }
2674 }
3dab1dad 2675#endif
b515a41d
YO
2676 }
2677 }
2678
2679 } /* do trie */
2680
c277df42 2681 }
b515a41d
YO
2682 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2683 scan = NEXTOPER(NEXTOPER(scan));
2684 } else /* single branch is optimized. */
2685 scan = NEXTOPER(scan);
2686 continue;
2687 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2688 scan_frame *newframe = NULL;
2689 I32 paren;
2690 regnode *start;
2691 regnode *end;
2692
2693 if (OP(scan) != SUSPEND) {
2694 /* set the pointer */
2695 if (OP(scan) == GOSUB) {
2696 paren = ARG(scan);
2697 RExC_recurse[ARG2L(scan)] = scan;
2698 start = RExC_open_parens[paren-1];
2699 end = RExC_close_parens[paren-1];
2700 } else {
2701 paren = 0;
2702 start = RExC_rx->program + 1;
2703 end = RExC_opend;
2704 }
2705 if (!recursed) {
2706 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2707 SAVEFREEPV(recursed);
2708 }
2709 if (!PAREN_TEST(recursed,paren+1)) {
2710 PAREN_SET(recursed,paren+1);
2711 Newx(newframe,1,scan_frame);
2712 } else {
2713 if (flags & SCF_DO_SUBSTR) {
2714 scan_commit(pRExC_state,data,minlenp);
2715 data->longest = &(data->longest_float);
2716 }
2717 is_inf = is_inf_internal = 1;
2718 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2719 cl_anything(pRExC_state, data->start_class);
2720 flags &= ~SCF_DO_STCLASS;
2721 }
2722 } else {
2723 Newx(newframe,1,scan_frame);
2724 paren = stopparen;
2725 start = scan+2;
2726 end = regnext(scan);
0eda9292 2727 }
b515a41d
YO
2728 if (newframe) {
2729 assert(start);
2730 assert(end);
2731 SAVEFREEPV(newframe);
2732 newframe->next = regnext(scan);
2733 newframe->last = end;
2734 newframe->stop = stopparen;
2735 newframe->prev = frame;
2736 frame = newframe;
2737 scan = start;
2738 stopparen = paren;
2739 continue;
2740 }
653099ff 2741 }
b515a41d
YO
2742 else if (OP(scan) == EXACT) {
2743 I32 l = STR_LEN(scan);
2744 UV uc;
2745 if (UTF) {
2746 const U8 * const s = (U8*)STRING(scan);
2747 l = utf8_length(s, s + l);
2748 uc = utf8_to_uvchr(s, NULL);
2749 } else {
2750 uc = *((U8*)STRING(scan));
2751 }
2752 min += l;
2753 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2754 /* The code below prefers earlier match for fixed
2755 offset, later match for variable offset. */
2756 if (data->last_end == -1) { /* Update the start info. */
2757 data->last_start_min = data->pos_min;
2758 data->last_start_max = is_inf
2759 ? I32_MAX : data->pos_min + data->pos_delta;
2760 }
2761 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2762 if (UTF)
2763 SvUTF8_on(data->last_found);
2764 {
2765 SV * const sv = data->last_found;
2766 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2767 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2768 if (mg && mg->mg_len >= 0)
2769 mg->mg_len += utf8_length((U8*)STRING(scan),
2770 (U8*)STRING(scan)+STR_LEN(scan));
2771 }
2772 data->last_end = data->pos_min + l;
2773 data->pos_min += l; /* As in the first entry. */
2774 data->flags &= ~SF_BEFORE_EOL;
2775 }
2776 if (flags & SCF_DO_STCLASS_AND) {
2777 /* Check whether it is compatible with what we know already! */
2778 int compat = 1;
2779
2780 if (uc >= 0x100 ||
2781 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2782 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2783 && (!(data->start_class->flags & ANYOF_FOLD)
2784 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2785 )
2786 compat = 0;
2787 ANYOF_CLASS_ZERO(data->start_class);
2788 ANYOF_BITMAP_ZERO(data->start_class);
2789 if (compat)
2790 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2791 data->start_class->flags &= ~ANYOF_EOS;
b515a41d
YO
2792 if (uc < 0x100)
2793 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff 2794 }
b515a41d
YO
2795 else if (flags & SCF_DO_STCLASS_OR) {
2796 /* false positive possible if the class is case-folded */
1aa99e6b
IH
2797 if (uc < 0x100)
2798 ANYOF_BITMAP_SET(data->start_class, uc);
b515a41d
YO
2799 else
2800 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2801 data->start_class->flags &= ~ANYOF_EOS;
b515a41d 2802 cl_and(data->start_class, and_withp);
653099ff 2803 }
b515a41d 2804 flags &= ~SCF_DO_STCLASS;
653099ff 2805 }
b515a41d
YO
2806 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2807 I32 l = STR_LEN(scan);
2808 UV uc = *((U8*)STRING(scan));
2809
2810 /* Search for fixed substrings supports EXACT only. */
c277df42 2811 if (flags & SCF_DO_SUBSTR) {
b515a41d
YO
2812 assert(data);
2813 scan_commit(pRExC_state, data, minlenp);
c277df42 2814 }
b515a41d
YO
2815 if (UTF) {
2816 const U8 * const s = (U8 *)STRING(scan);
2817 l = utf8_length(s, s + l);
2818 uc = utf8_to_uvchr(s, NULL);
653099ff 2819 }
b515a41d
YO
2820 min += l;
2821 if (flags & SCF_DO_SUBSTR)
2822 data->pos_min += l;
2823 if (flags & SCF_DO_STCLASS_AND) {
2824 /* Check whether it is compatible with what we know already! */
2825 int compat = 1;
2826
2827 if (uc >= 0x100 ||
2828 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2829 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2830 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2831 compat = 0;
2832 ANYOF_CLASS_ZERO(data->start_class);
2833 ANYOF_BITMAP_ZERO(data->start_class);
2834 if (compat) {
2835 ANYOF_BITMAP_SET(data->start_class, uc);
2836 data->start_class->flags &= ~ANYOF_EOS;
2837 data->start_class->flags |= ANYOF_FOLD;
2838 if (OP(scan) == EXACTFL)
2839 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2840 }
653099ff 2841 }
b515a41d
YO
2842 else if (flags & SCF_DO_STCLASS_OR) {
2843 if (data->start_class->flags & ANYOF_FOLD) {
2844 /* false positive possible if the class is case-folded.
2845 Assume that the locale settings are the same... */
2846 if (uc < 0x100)
2847 ANYOF_BITMAP_SET(data->start_class, uc);
2848 data->start_class->flags &= ~ANYOF_EOS;
2849 }
2850 cl_and(data->start_class, and_withp);
b45f050a 2851 }
b515a41d
YO
2852 flags &= ~SCF_DO_STCLASS;
2853 }
2854 else if (strchr((const char*)PL_varies,OP(scan))) {
2855 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2856 I32 f = flags, pos_before = 0;
2857 regnode * const oscan = scan;
2858 struct regnode_charclass_class this_class;
2859 struct regnode_charclass_class *oclass = NULL;
2860 I32 next_is_eval = 0;
b45f050a 2861
b515a41d
YO
2862 switch (PL_regkind[OP(scan)]) {
2863 case WHILEM: /* End of (?:...)* . */
2864 scan = NEXTOPER(scan);
2865 goto finish;
2866 case PLUS:
2867 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2868 next = NEXTOPER(scan);
2869 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2870 mincount = 1;
2871 maxcount = REG_INFTY;
2872 next = regnext(scan);
2873 scan = NEXTOPER(scan);
2874 goto do_curly;
2875 }
2876 }
2877 if (flags & SCF_DO_SUBSTR)
2878 data->pos_min++;
2879 min++;
2880 /* Fall through. */
2881 case STAR:
2882 if (flags & SCF_DO_STCLASS) {
2883 mincount = 0;
2884 maxcount = REG_INFTY;
2885 next = regnext(scan);
2886 scan = NEXTOPER(scan);
2887 goto do_curly;
2888 }
2889 is_inf = is_inf_internal = 1;
2890 scan = regnext(scan);
2891 if (flags & SCF_DO_SUBSTR) {
2892 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2893 data->longest = &(data->longest_float);
2894 }
2895 goto optimize_curly_tail;
2896 case CURLY:
2897 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2898 && (scan->flags == stopparen))
2899 {
2900 mincount = 1;
2901 maxcount = 1;
2902 } else {
2903 mincount = ARG1(scan);
2904 maxcount = ARG2(scan);
2905 }
2906 next = regnext(scan);
2907 if (OP(scan) == CURLYX) {
2908 I32 lp = (data ? *(data->last_closep) : 0);
2909 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2910 }
2911 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2912 next_is_eval = (OP(scan) == EVAL);
2913do_curly:
2914 if (flags & SCF_DO_SUBSTR) {
2915 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2916 pos_before = data->pos_min;
2917 }
2918 if (data) {
2919 fl = data->flags;
2920 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2921 if (is_inf)
2922 data->flags |= SF_IS_INF;
2923 }
2924 if (flags & SCF_DO_STCLASS) {
2925 cl_init(pRExC_state, &this_class);
2926 oclass = data->start_class;
2927 data->start_class = &this_class;
2928 f |= SCF_DO_STCLASS_AND;
2929 f &= ~SCF_DO_STCLASS_OR;
2930 }
2931 /* These are the cases when once a subexpression
2932 fails at a particular position, it cannot succeed
2933 even after backtracking at the enclosing scope.
2934
2935 XXXX what if minimal match and we are at the
2936 initial run of {n,m}? */
2937 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2938 f &= ~SCF_WHILEM_VISITED_POS;
2939
2940 /* This will finish on WHILEM, setting scan, or on NULL: */
2941 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2942 last, data, stopparen, recursed, NULL,
2943 (mincount == 0
2944 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2945
2946 if (flags & SCF_DO_STCLASS)
2947 data->start_class = oclass;
2948 if (mincount == 0 || minnext == 0) {
2949 if (flags & SCF_DO_STCLASS_OR) {
2950 cl_or(pRExC_state, data->start_class, &this_class);
2951 }
2952 else if (flags & SCF_DO_STCLASS_AND) {
2953 /* Switch to OR mode: cache the old value of
2954 * data->start_class */
2955 INIT_AND_WITHP;
2956 StructCopy(data->start_class, and_withp,
2957 struct regnode_charclass_class);
2958 flags &= ~SCF_DO_STCLASS_AND;
2959 StructCopy(&this_class, data->start_class,
2960 struct regnode_charclass_class);
2961 flags |= SCF_DO_STCLASS_OR;
2962 data->start_class->flags |= ANYOF_EOS;
2963 }
2964 } else { /* Non-zero len */
2965 if (flags & SCF_DO_STCLASS_OR) {
2966 cl_or(pRExC_state, data->start_class, &this_class);
2967 cl_and(data->start_class, and_withp);
2968 }
2969 else if (flags & SCF_DO_STCLASS_AND)
2970 cl_and(data->start_class, &this_class);
2971 flags &= ~SCF_DO_STCLASS;
2972 }
2973 if (!scan) /* It was not CURLYX, but CURLY. */
2974 scan = next;
2975 if ( /* ? quantifier ok, except for (?{ ... }) */
2976 (next_is_eval || !(mincount == 0 && maxcount == 1))
2977 && (minnext == 0) && (deltanext == 0)
2978 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2979 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2980 && ckWARN(WARN_REGEXP))
2981 {
2982 vWARN(RExC_parse,
2983 "Quantifier unexpected on zero-length expression");
2984 }
2985
2986 min += minnext * mincount;
2987 is_inf_internal |= ((maxcount == REG_INFTY
2988 && (minnext + deltanext) > 0)
2989 || deltanext == I32_MAX);
2990 is_inf |= is_inf_internal;
2991 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2992
2993 /* Try powerful optimization CURLYX => CURLYN. */
2994 if ( OP(oscan) == CURLYX && data
2995 && data->flags & SF_IN_PAR
2996 && !(data->flags & SF_HAS_EVAL)
2997 && !deltanext && minnext == 1 ) {
2998 /* Try to optimize to CURLYN. */
2999 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3000 regnode * const nxt1 = nxt;
497b47a8 3001#ifdef DEBUGGING
b515a41d 3002 regnode *nxt2;
497b47a8 3003#endif
c277df42 3004
b515a41d
YO
3005 /* Skip open. */
3006 nxt = regnext(nxt);
3007 if (!strchr((const char*)PL_simple,OP(nxt))
3008 && !(PL_regkind[OP(nxt)] == EXACT
3009 && STR_LEN(nxt) == 1))
3010 goto nogo;
497b47a8 3011#ifdef DEBUGGING
b515a41d 3012 nxt2 = nxt;
497b47a8 3013#endif
b515a41d
YO
3014 nxt = regnext(nxt);
3015 if (OP(nxt) != CLOSE)
3016 goto nogo;
3017 if (RExC_open_parens) {
3018 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3019 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3020 }
3021 /* Now we know that nxt2 is the only contents: */
3022 oscan->flags = (U8)ARG(nxt);
3023 OP(oscan) = CURLYN;
3024 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3025
c277df42 3026#ifdef DEBUGGING
b515a41d
YO
3027 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3028 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3029 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3030 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3031 OP(nxt + 1) = OPTIMIZED; /* was count. */
3032 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3033#endif
40d049e4 3034 }
b515a41d
YO
3035nogo:
3036
3037 /* Try optimization CURLYX => CURLYM. */
3038 if ( OP(oscan) == CURLYX && data
3039 && !(data->flags & SF_HAS_PAR)
3040 && !(data->flags & SF_HAS_EVAL)
3041 && !deltanext /* atom is fixed width */
3042 && minnext != 0 /* CURLYM can't handle zero width */
3043 ) {
3044 /* XXXX How to optimize if data == 0? */
3045 /* Optimize to a simpler form. */
3046 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3047 regnode *nxt2;
3048
3049 OP(oscan) = CURLYM;
3050 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3051 && (OP(nxt2) != WHILEM))
3052 nxt = nxt2;
3053 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3054 /* Need to optimize away parenths. */
3055 if (data->flags & SF_IN_PAR) {
3056 /* Set the parenth number. */
3057 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3058
3059 if (OP(nxt) != CLOSE)
3060 FAIL("Panic opt close");
3061 oscan->flags = (U8)ARG(nxt);
3062 if (RExC_open_parens) {
3063 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3064 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3065 }
3066 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3067 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3068
c277df42 3069#ifdef DEBUGGING
b515a41d
YO
3070 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3071 OP(nxt + 1) = OPTIMIZED; /* was count. */
3072 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3073 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3074#endif
c277df42 3075#if 0
b515a41d
YO
3076 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3077 regnode *nnxt = regnext(nxt1);
3078
3079 if (nnxt == nxt) {
3080 if (reg_off_by_arg[OP(nxt1)])
3081 ARG_SET(nxt1, nxt2 - nxt1);
3082 else if (nxt2 - nxt1 < U16_MAX)
3083 NEXT_OFF(nxt1) = nxt2 - nxt1;
3084 else
3085 OP(nxt) = NOTHING; /* Cannot beautify */
3086 }
3087 nxt1 = nnxt;
3088 }
3089#endif
3090 /* Optimize again: */
3091 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3092 NULL, stopparen, recursed, NULL, 0,depth+1);
c277df42 3093 }
b515a41d
YO
3094 else
3095 oscan->flags = 0;
c277df42 3096 }
b515a41d
YO
3097 else if ((OP(oscan) == CURLYX)
3098 && (flags & SCF_WHILEM_VISITED_POS)
3099 /* See the comment on a similar expression above.
3100 However, this time it not a subexpression
3101 we care about, but the expression itself. */
3102 && (maxcount == REG_INFTY)
3103 && data && ++data->whilem_c < 16) {
3104 /* This stays as CURLYX, we can put the count/of pair. */
3105 /* Find WHILEM (as in regexec.c) */
3106 regnode *nxt = oscan + NEXT_OFF(oscan);
3107
3108 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3109 nxt += ARG(nxt);
3110 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3111 | (RExC_whilem_seen << 4)); /* On WHILEM */
3112 }
3113 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3114 pars++;
3115 if (flags & SCF_DO_SUBSTR) {
3116 SV *last_str = NULL;
3117 int counted = mincount != 0;
c277df42 3118
b515a41d 3119 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c 3120#if defined(SPARC64_GCC_WORKAROUND)
b515a41d
YO
3121 I32 b = 0;
3122 STRLEN l = 0;
3123 const char *s = NULL;
3124 I32 old = 0;
5d1c421c 3125
b515a41d
YO
3126 if (pos_before >= data->last_start_min)
3127 b = pos_before;
3128 else
3129 b = data->last_start_min;
5d1c421c 3130
b515a41d
YO
3131 l = 0;
3132 s = SvPV_const(data->last_found, l);
3133 old = b - data->last_start_min;
5d1c421c
JH
3134
3135#else
b515a41d
YO
3136 I32 b = pos_before >= data->last_start_min
3137 ? pos_before : data->last_start_min;
3138 STRLEN l;
3139 const char * const s = SvPV_const(data->last_found, l);
3140 I32 old = b - data->last_start_min;
5d1c421c 3141#endif
a0ed51b3 3142
b515a41d
YO
3143 if (UTF)
3144 old = utf8_hop((U8*)s, old) - (U8*)s;
3145
3146 l -= old;
3147 /* Get the added string: */
3148 last_str = newSVpvn(s + old, l);
3149 if (UTF)
3150 SvUTF8_on(last_str);
3151 if (deltanext == 0 && pos_before == b) {
3152 /* What was added is a constant string */
3153 if (mincount > 1) {
3154 SvGROW(last_str, (mincount * l) + 1);
3155 repeatcpy(SvPVX(last_str) + l,
3156 SvPVX_const(last_str), l, mincount - 1);
3157 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3158 /* Add additional parts. */
3159 SvCUR_set(data->last_found,
3160 SvCUR(data->last_found) - l);
3161 sv_catsv(data->last_found, last_str);
3162 {
3163 SV * sv = data->last_found;
3164 MAGIC *mg =
3165 SvUTF8(sv) && SvMAGICAL(sv) ?
3166 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3167 if (mg && mg->mg_len >= 0)
3168 mg->mg_len += CHR_SVLEN(last_str);
3169 }
3170 data->last_end += l * (mincount - 1);
3171 }
3172 } else {
3173 /* start offset must point into the last copy */
3174 data->last_start_min += minnext * (mincount - 1);
3175 data->last_start_max += is_inf ? I32_MAX
3176 : (maxcount - 1) * (minnext + data->pos_delta);
3177 }
3178 }
3179 /* It is counted once already... */
3180 data->pos_min += minnext * (mincount - counted);
3181 data->pos_delta += - counted * deltanext +
3182 (minnext + deltanext) * maxcount - minnext * mincount;
3183 if (mincount != maxcount) {
3184 /* Cannot extend fixed substrings found inside
3185 the group. */
3186 scan_commit(pRExC_state,data,minlenp);
3187 if (mincount && last_str) {
3188 SV * const sv = data->last_found;
3189 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292 3190 mg_find(sv, PERL_MAGIC_utf8) : NULL;
b515a41d
YO
3191
3192 if (mg)
3193 mg->mg_len = -1;
3194 sv_setsv(sv, last_str);
3195 data->last_end = data->pos_min;
3196 data->last_start_min =
3197 data->pos_min - CHR_SVLEN(last_str);
3198 data->last_start_max = is_inf
3199 ? I32_MAX
3200 : data->pos_min + data->pos_delta
3201 - CHR_SVLEN(last_str);
0eda9292 3202 }
b515a41d 3203 data->longest = &(data->longest_float);
c277df42 3204 }
b515a41d 3205 SvREFCNT_dec(last_str);
c277df42 3206 }
b515a41d
YO
3207 if (data && (fl & SF_HAS_EVAL))
3208 data->flags |= SF_HAS_EVAL;
3209optimize_curly_tail:
3210 if (OP(oscan) != CURLYX) {
3211 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3212 && NEXT_OFF(next))
3213 NEXT_OFF(oscan) += NEXT_OFF(next);
c277df42 3214 }
b515a41d
YO
3215 continue;
3216 default: /* REF and CLUMP only? */
3217 if (flags & SCF_DO_SUBSTR) {
3218 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3219 data->longest = &(data->longest_float);
3220 }
3221 is_inf = is_inf_internal = 1;
3222 if (flags & SCF_DO_STCLASS_OR)
3223 cl_anything(pRExC_state, data->start_class);
3224 flags &= ~SCF_DO_STCLASS;
3225 break;
3226 }
3227 }
3228 else if (strchr((const char*)PL_simple,OP(scan))) {
3229 int value = 0;
3230
3231 if (flags & SCF_DO_SUBSTR) {
3232 scan_commit(pRExC_state,data,minlenp);
3233 data->pos_min++;
3234 }
3235 min++;
3236 if (flags & SCF_DO_STCLASS) {
3237 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3238
3239 /* Some of the logic below assumes that switching
3240 locale on will only add false positives. */
3241 switch (PL_regkind[OP(scan)]) {
3242 case SANY:
3243 default:
3244do_default:
3245 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3246 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3247 cl_anything(pRExC_state, data->start_class);
3248 break;
3249 case REG_ANY:
3250 if (OP(scan) == SANY)
3251 goto do_default;
3252 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3253 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3254 || (data->start_class->flags & ANYOF_CLASS));
3255 cl_anything(pRExC_state, data->start_class);
3256 }
3257 if (flags & SCF_DO_STCLASS_AND || !value)
3258 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3259 break;
3260 case ANYOF:
3261 if (flags & SCF_DO_STCLASS_AND)
3262 cl_and(data->start_class,
3263 (struct regnode_charclass_class*)scan);
3264 else
3265 cl_or(pRExC_state, data->start_class,
3266 (struct regnode_charclass_class*)scan);
3267 break;
3268 case ALNUM:
3269 if (flags & SCF_DO_STCLASS_AND) {
3270 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3271 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3272 for (value = 0; value < 256; value++)
3273 if (!isALNUM(value))
3274 ANYOF_BITMAP_CLEAR(data->start_class, value);
3275 }
3276 }
3277 else {
3278 if (data->start_class->flags & ANYOF_LOCALE)
3279 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3280 else {
3281 for (value = 0; value < 256; value++)
3282 if (isALNUM(value))
3283 ANYOF_BITMAP_SET(data->start_class, value);
3284 }
3285 }
3286 break;
3287 case ALNUML:
3288 if (flags & SCF_DO_STCLASS_AND) {
3289 if (data->start_class->flags & ANYOF_LOCALE)
3290 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3291 }
3292 else {
3293 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3294 data->start_class->flags |= ANYOF_LOCALE;
3295 }
3296 break;
3297 case NALNUM:
3298 if (flags & SCF_DO_STCLASS_AND) {
3299 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3300 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3301 for (value = 0; value < 256; value++)
3302 if (isALNUM(value))
3303 ANYOF_BITMAP_CLEAR(data->start_class, value);
3304 }
3305 }
3306 else {
3307 if (data->start_class->flags & ANYOF_LOCALE)
3308 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3309 else {
3310 for (value = 0; value < 256; value++)
3311 if (!isALNUM(value))
3312 ANYOF_BITMAP_SET(data->start_class, value);
3313 }
3314 }
3315 break;
3316 case NALNUML:
3317 if (flags & SCF_DO_STCLASS_AND) {
3318 if (data->start_class->flags & ANYOF_LOCALE)
3319 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3320 }
3321 else {
3322 data->start_class->flags |= ANYOF_LOCALE;
3323 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3324 }
3325 break;
3326 case SPACE:
3327 if (flags & SCF_DO_STCLASS_AND) {
3328 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3329 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3330 for (value = 0; value < 256; value++)
3331 if (!isSPACE(value))
3332 ANYOF_BITMAP_CLEAR(data->start_class, value);
3333 }
3334 }
3335 else {
3336 if (data->start_class->flags & ANYOF_LOCALE)
3337 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3338 else {
3339 for (value = 0; value < 256; value++)
3340 if (isSPACE(value))
3341 ANYOF_BITMAP_SET(data->start_class, value);
3342 }
3343 }
3344 break;
3345 case SPACEL:
3346 if (flags & SCF_DO_STCLASS_AND) {
3347 if (data->start_class->flags & ANYOF_LOCALE)
3348 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3349 }
3350 else {
3351 data->start_class->flags |= ANYOF_LOCALE;
3352 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3353 }
3354 break;
3355 case NSPACE:
3356 if (flags & SCF_DO_STCLASS_AND) {
3357 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3358 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3359 for (value = 0; value < 256; value++)
3360 if (isSPACE(value))
3361 ANYOF_BITMAP_CLEAR(data->start_class, value);
3362 }
3363 }
3364 else {
3365 if (data->start_class->flags & ANYOF_LOCALE)
3366 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3367 else {
3368 for (value = 0; value < 256; value++)
3369 if (!isSPACE(value))
3370 ANYOF_BITMAP_SET(data->start_class, value);
3371 }
3372 }
3373 break;
3374 case NSPACEL:
3375 if (flags & SCF_DO_STCLASS_AND) {
3376 if (data->start_class->flags & ANYOF_LOCALE) {
3377 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3378 for (value = 0; value < 256; value++)
3379 if (!isSPACE(value))
3380 ANYOF_BITMAP_CLEAR(data->start_class, value);
3381 }
3382 }
3383 else {
3384 data->start_class->flags |= ANYOF_LOCALE;
3385 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3386 }
3387 break;
3388 case DIGIT:
3389 if (flags & SCF_DO_STCLASS_AND) {
3390 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3391 for (value = 0; value < 256; value++)
3392 if (!isDIGIT(value))
3393 ANYOF_BITMAP_CLEAR(data->start_class, value);
3394 }
3395 else {
3396 if (data->start_class->flags & ANYOF_LOCALE)
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3398 else {
3399 for (value = 0; value < 256; value++)
3400 if (isDIGIT(value))
3401 ANYOF_BITMAP_SET(data->start_class, value);
3402 }
3403 }
3404 break;
3405 case NDIGIT:
3406 if (flags & SCF_DO_STCLASS_AND) {
3407 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3408 for (value = 0; value < 256; value++)
3409 if (isDIGIT(value))
3410 ANYOF_BITMAP_CLEAR(data->start_class, value);
3411 }
3412 else {
3413 if (data->start_class->flags & ANYOF_LOCALE)
3414 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3415 else {
3416 for (value = 0; value < 256; value++)
3417 if (!isDIGIT(value))
3418 ANYOF_BITMAP_SET(data->start_class, value);
3419 }
3420 }
3421 break;
c277df42 3422 }
b515a41d
YO
3423 if (flags & SCF_DO_STCLASS_OR)
3424 cl_and(data->start_class, and_withp);
3425 flags &= ~SCF_DO_STCLASS;
c277df42 3426 }
c277df42 3427 }
b515a41d
YO
3428 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3429 data->flags |= (OP(scan) == MEOL
3430 ? SF_BEFORE_MEOL
3431 : SF_BEFORE_SEOL);
c277df42 3432 }
b515a41d
YO
3433 else if ( PL_regkind[OP(scan)] == BRANCHJ
3434 /* Lookbehind, or need to calculate parens/evals/stclass: */
3435 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3436 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3437 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3438 || OP(scan) == UNLESSM )
3439 {
3440 /* Negative Lookahead/lookbehind
3441 In this case we can't do fixed string optimisation.
3442 */
653099ff 3443
b515a41d
YO
3444 I32 deltanext, minnext, fake = 0;
3445 regnode *nscan;
3446 struct regnode_charclass_class intrnl;
3447 int f = 0;
3448
3449 data_fake.flags = 0;
3450 if (data) {
3451 data_fake.whilem_c = data->whilem_c;
3452 data_fake.last_closep = data->last_closep;
653099ff 3453 }
653099ff 3454 else
b515a41d
YO
3455 data_fake.last_closep = &fake;
3456 if ( flags & SCF_DO_STCLASS && !scan->flags
3457 && OP(scan) == IFMATCH ) { /* Lookahead */
3458 cl_init(pRExC_state, &intrnl);
3459 data_fake.start_class = &intrnl;
3460 f |= SCF_DO_STCLASS_AND;
653099ff 3461 }
b515a41d
YO
3462 if (flags & SCF_WHILEM_VISITED_POS)
3463 f |= SCF_WHILEM_VISITED_POS;
3464 next = regnext(scan);
3465 nscan = NEXTOPER(NEXTOPER(scan));
3466 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3467 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3468 if (scan->flags) {
3469 if (deltanext) {
3470 vFAIL("Variable length lookbehind not implemented");
653099ff 3471 }
b515a41d
YO
3472 else if (minnext > (I32)U8_MAX) {
3473 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
653099ff 3474 }
b515a41d 3475 scan->flags = (U8)minnext;
653099ff 3476 }
b515a41d
YO
3477 if (data) {
3478 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3479 pars++;
3480 if (data_fake.flags & SF_HAS_EVAL)
3481 data->flags |= SF_HAS_EVAL;
3482 data->whilem_c = data_fake.whilem_c;
653099ff 3483 }
b515a41d
YO
3484 if (f & SCF_DO_STCLASS_AND) {
3485 const int was = (data->start_class->flags & ANYOF_EOS);
3486
3487 cl_and(data->start_class, &intrnl);
3488 if (was)
3489 data->start_class->flags |= ANYOF_EOS;
653099ff 3490 }
b515a41d
YO
3491 }
3492#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3493 else {
3494 /* Positive Lookahead/lookbehind
3495 In this case we can do fixed string optimisation,
3496 but we must be careful about it. Note in the case of
3497 lookbehind the positions will be offset by the minimum
3498 length of the pattern, something we won't know about
3499 until after the recurse.
3500 */
3501 I32 deltanext, fake = 0;
3502 regnode *nscan;
3503 struct regnode_charclass_class intrnl;
3504 int f = 0;
3505 /* We use SAVEFREEPV so that when the full compile
3506 is finished perl will clean up the allocated
3507 minlens when its all done. This was we don't
3508 have to worry about freeing them when we know
3509 they wont be used, which would be a pain.
3510 */
3511 I32 *minnextp;
3512 Newx( minnextp, 1, I32 );
3513 SAVEFREEPV(minnextp);
3514
3515 if (data) {
3516 StructCopy(data, &data_fake, scan_data_t);
3517 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3518 f |= SCF_DO_SUBSTR;
3519 if (scan->flags)
3520 scan_commit(pRExC_state, &data_fake,minlenp);
3521 data_fake.last_found=newSVsv(data->last_found);
653099ff
GS
3522 }
3523 }
b515a41d
YO
3524 else
3525 data_fake.last_closep = &fake;
3526 data_fake.flags = 0;
3527 if (is_inf)
3528 data_fake.flags |= SF_IS_INF;
3529 if ( flags & SCF_DO_STCLASS && !scan->flags
3530 && OP(scan) == IFMATCH ) { /* Lookahead */
3531 cl_init(pRExC_state, &intrnl);
3532 data_fake.start_class = &intrnl;
3533 f |= SCF_DO_STCLASS_AND;
653099ff 3534 }
b515a41d
YO
3535 if (flags & SCF_WHILEM_VISITED_POS)
3536 f |= SCF_WHILEM_VISITED_POS;
3537 next = regnext(scan);
3538 nscan = NEXTOPER(NEXTOPER(scan));
3539
3540 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3541 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3542 if (scan->flags) {
3543 if (deltanext) {
3544 vFAIL("Variable length lookbehind not implemented");
653099ff 3545 }
b515a41d
YO
3546 else if (*minnextp > (I32)U8_MAX) {
3547 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
653099ff 3548 }
b515a41d 3549 scan->flags = (U8)*minnextp;
653099ff 3550 }
b515a41d
YO
3551
3552 *minnextp += min;
3553
3554 if (f & SCF_DO_STCLASS_AND) {
3555 const int was = (data->start_class->flags & ANYOF_EOS);
3556
3557 cl_and(data->start_class, &intrnl);
3558 if (was)
3559 data->start_class->flags |= ANYOF_EOS;
653099ff 3560 }
b515a41d
YO
3561 if (data) {
3562 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3563 pars++;
3564 if (data_fake.flags & SF_HAS_EVAL)
3565 data->flags |= SF_HAS_EVAL;
3566 data->whilem_c = data_fake.whilem_c;
3567 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3568 if (RExC_rx->minlen<*minnextp)
3569 RExC_rx->minlen=*minnextp;
3570 scan_commit(pRExC_state, &data_fake, minnextp);
3571 SvREFCNT_dec(data_fake.last_found);
3572
3573 if ( data_fake.minlen_fixed != minlenp )
3574 {
3575 data->offset_fixed= data_fake.offset_fixed;
3576 data->minlen_fixed= data_fake.minlen_fixed;
3577 data->lookbehind_fixed+= scan->flags;
3578 }
3579 if ( data_fake.minlen_float != minlenp )
3580 {
3581 data->minlen_float= data_fake.minlen_float;
3582 data->offset_float_min=data_fake.offset_float_min;
3583 data->offset_float_max=data_fake.offset_float_max;
3584 data->lookbehind_float+= scan->flags;
3585 }
653099ff
GS
3586 }
3587 }
1de06328 3588
1de06328 3589
c277df42 3590 }
1de06328 3591#endif
cb434fcc 3592 }
b515a41d
YO
3593 else if (OP(scan) == OPEN) {
3594 if (stopparen != (I32)ARG(scan))
3595 pars++;
40d049e4 3596 }
b515a41d
YO
3597 else if (OP(scan) == CLOSE) {
3598 if (stopparen == (I32)ARG(scan)) {
3599 break;
40d049e4 3600 }
b515a41d
YO
3601 if ((I32)ARG(scan) == is_par) {
3602 next = regnext(scan);
3603
3604 if ( next && (OP(next) != WHILEM) && next < last)
3605 is_par = 0; /* Disable optimization */
40d049e4 3606 }
b515a41d
YO
3607 if (data)
3608 *(data->last_closep) = ARG(scan);
40d049e4 3609 }
b515a41d 3610 else if (OP(scan) == EVAL) {
c277df42
IZ
3611 if (data)
3612 data->flags |= SF_HAS_EVAL;
40d049e4 3613 }
b515a41d
YO
3614 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3615 if (flags & SCF_DO_SUBSTR) {
3616 scan_commit(pRExC_state,data,minlenp);
3617 flags &= ~SCF_DO_SUBSTR;
3618 }
3619 if (data && OP(scan)==ACCEPT) {
3620 data->flags |= SCF_SEEN_ACCEPT;
3621 if (stopmin > min)
3622 stopmin = min;
3623 }
e2e6a0f1 3624 }
b515a41d
YO
3625 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3626 {
0f5d15d6 3627 if (flags & SCF_DO_SUBSTR) {
1de06328 3628 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3629 data->longest = &(data->longest_float);
3630 }
3631 is_inf = is_inf_internal = 1;
653099ff 3632 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3633 cl_anything(pRExC_state, data->start_class);
96776eda 3634 flags &= ~SCF_DO_STCLASS;
b515a41d 3635 }
786e8c11 3636#ifdef TRIE_STUDY_OPT
40d049e4 3637#ifdef FULL_TRIE_STUDY
b515a41d
YO
3638 else if (PL_regkind[OP(scan)] == TRIE) {
3639 /* NOTE - There is similar code to this block above for handling
3640 BRANCH nodes on the initial study. If you change stuff here
3641 check there too. */
3642 regnode *trie_node= scan;
3643 regnode *tail= regnext(scan);
3644 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3645 I32 max1 = 0, min1 = I32_MAX;
3646 struct regnode_charclass_class accum;
3647
3648 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3649 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3650 if (flags & SCF_DO_STCLASS)
3651 cl_init_zero(pRExC_state, &accum);
3652
3653 if (!trie->jump) {
3654 min1= trie->minlen;
3655 max1= trie->maxlen;
3656 } else {
3657 const regnode *nextbranch= NULL;
3658 U32 word;
3659
3660 for ( word=1 ; word <= trie->wordcount ; word++)
3661 {
3662 I32 deltanext=0, minnext=0, f = 0, fake;
3663 struct regnode_charclass_class this_class;
3664
3665 data_fake.flags = 0;
3666 if (data) {
3667 data_fake.whilem_c = data->whilem_c;
3668 data_fake.last_closep = data->last_closep;
3669 }
3670 else
3671 data_fake.last_closep = &fake;
3672
3673 if (flags & SCF_DO_STCLASS) {
3674 cl_init(pRExC_state, &this_class);
3675 data_fake.start_class = &this_class;
3676 f = SCF_DO_STCLASS_AND;
3677 }
3678 if (flags & SCF_WHILEM_VISITED_POS)
3679 f |= SCF_WHILEM_VISITED_POS;
3680
3681 if (trie->jump[word]) {
3682 if (!nextbranch)
3683 nextbranch = trie_node + trie->jump[0];
3684 scan= trie_node + trie->jump[word];
3685 /* We go from the jump point to the branch that follows
3686 it. Note this means we need the vestigal unused branches
3687 even though they arent otherwise used.
3688 */
3689 minnext = study_chunk(pRExC_state, &scan, minlenp,
3690 &deltanext, (regnode *)nextbranch, &data_fake,
3691 stopparen, recursed, NULL, f,depth+1);
3692 }
3693 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3694 nextbranch= regnext((regnode*)nextbranch);
3695
3696 if (min1 > (I32)(minnext + trie->minlen))
3697 min1 = minnext + trie->minlen;
3698 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3699 max1 = minnext + deltanext + trie->maxlen;
3700 if (deltanext == I32_MAX)
3701 is_inf = is_inf_internal = 1;
3702
3703 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3704 pars++;
3705 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3706 if ( stopmin > min + min1)
3707 stopmin = min + min1;
3708 flags &= ~SCF_DO_SUBSTR;
3709 if (data)
3710 data->flags |= SCF_SEEN_ACCEPT;
3711 }
3712 if (data) {
3713 if (data_fake.flags & SF_HAS_EVAL)
3714 data->flags |= SF_HAS_EVAL;
3715 data->whilem_c = data_fake.whilem_c;
3716 }
3717 if (flags & SCF_DO_STCLASS)
3718 cl_or(pRExC_state, &accum, &this_class);
3719 }
3720 }
3721 if (flags & SCF_DO_SUBSTR) {
3722 data->pos_min += min1;
3723 data->pos_delta += max1 - min1;
3724 if (max1 != min1 || is_inf)
3725 data->longest = &(data->longest_float);
3726 }
3727 min += min1;
3728 delta += max1 - min1;
3729 if (flags & SCF_DO_STCLASS_OR) {
3730 cl_or(pRExC_state, data->start_class, &accum);
3731 if (min1) {
3732 cl_and(data->start_class, and_withp);
3733 flags &= ~SCF_DO_STCLASS;
3734 }
3735 }
3736 else if (flags & SCF_DO_STCLASS_AND) {
3737 if (min1) {
3738 cl_and(data->start_class, &accum);
3739 flags &= ~SCF_DO_STCLASS;
3740 }
3741 else {
3742 /* Switch to OR mode: cache the old value of
3743 * data->start_class */
3744 INIT_AND_WITHP;
3745 StructCopy(data->start_class, and_withp,
3746 struct regnode_charclass_class);
3747 flags &= ~SCF_DO_STCLASS_AND;
3748 StructCopy(&accum, data->start_class,
3749 struct regnode_charclass_class);
3750 flags |= SCF_DO_STCLASS_OR;
3751 data->start_class->flags |= ANYOF_EOS;
3752 }
3753 }
3754 scan= tail;
3755 continue;
3756 }
786e8c11 3757#else
b515a41d
YO
3758 else if (PL_regkind[OP(scan)] == TRIE) {
3759 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3760 U8*bang=NULL;
3761
3762 min += trie->minlen;
3763 delta += (trie->maxlen - trie->minlen);
3764 flags &= ~SCF_DO_STCLASS; /* xxx */
3765 if (flags & SCF_DO_SUBSTR) {
3766 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3767 data->pos_min += trie->minlen;
3768 data->pos_delta += (trie->maxlen - trie->minlen);
3769 if (trie->maxlen != trie->minlen)
3770 data->longest = &(data->longest_float);
3771 }
3772 if (trie->jump) /* no more substrings -- for now /grr*/
3773 flags &= ~SCF_DO_SUBSTR;
3774 }
786e8c11 3775#endif /* old or new */
b515a41d
YO
3776#endif /* TRIE_STUDY_OPT */
3777 /* Else: zero-length, ignore. */
3778 scan = regnext(scan);
3779 }
3780 DEBUG_PEEP("FEND",scan,depth);
3781 scan = frame->next;
3782 stopparen = frame->stop;
3783 frame = frame->prev;
3784
c277df42
IZ
3785 }
3786
3787 finish:
3788 *scanp = scan;
aca2d497 3789 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3790 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3791 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3792 if (is_par > (I32)U8_MAX)
c277df42
IZ
3793 is_par = 0;
3794 if (is_par && pars==1 && data) {
3795 data->flags |= SF_IN_PAR;
3796 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3797 }
3798 else if (pars && data) {
c277df42
IZ
3799 data->flags |= SF_HAS_PAR;
3800 data->flags &= ~SF_IN_PAR;
3801 }
653099ff 3802 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3803 cl_and(data->start_class, and_withp);
786e8c11
YO
3804 if (flags & SCF_TRIE_RESTUDY)
3805 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3806
3807 DEBUG_STUDYDATA(data,depth);
3808
e2e6a0f1 3809 return min < stopmin ? min : stopmin;
c277df42
IZ
3810}
3811
76e3520e 3812STATIC I32
5f66b61c 3813S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3814{
830247a4 3815 if (RExC_rx->data) {
2eb97020 3816 const U32 count = RExC_rx->data->count;
b81d288d 3817 Renewc(RExC_rx->data,
2eb97020 3818 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3819 char, struct reg_data);
2eb97020 3820 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3821 RExC_rx->data->count += n;
a0ed51b3
LW
3822 }
3823 else {
a02a5408 3824 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3825 char, struct reg_data);
a02a5408 3826 Newx(RExC_rx->data->what, n, U8);
830247a4 3827 RExC_rx->data->count = n;
c277df42 3828 }
830247a4
IZ
3829 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3830 return RExC_rx->data->count - n;
c277df42
IZ
3831}
3832
76234dfb 3833#ifndef PERL_IN_XSUB_RE
d88dccdf 3834void
864dbfa3 3835Perl_reginitcolors(pTHX)
d88dccdf 3836{
97aff369 3837 dVAR;
1df70142 3838 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3839 if (s) {
1df70142
AL
3840 char *t = savepv(s);
3841 int i = 0;
3842 PL_colors[0] = t;
d88dccdf 3843 while (++i < 6) {
1df70142
AL
3844 t = strchr(t, '\t');
3845 if (t) {
3846 *t = '\0';
3847 PL_colors[i] = ++t;
d88dccdf
IZ
3848 }
3849 else
1df70142 3850 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3851 }
3852 } else {
1df70142 3853 int i = 0;
b81d288d 3854 while (i < 6)
06b5626a 3855 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3856 }
3857 PL_colorset = 1;
3858}
76234dfb 3859#endif
8615cb43 3860
07be1b83 3861
786e8c11
YO
3862#ifdef TRIE_STUDY_OPT
3863#define CHECK_RESTUDY_GOTO \
3864 if ( \
3865 (data.flags & SCF_TRIE_RESTUDY) \
3866 && ! restudied++ \
3867 ) goto reStudy
3868#else
3869#define CHECK_RESTUDY_GOTO
3870#endif
f9f4320a 3871
a687059c 3872/*
e50aee73 3873 - pregcomp - compile a regular expression into internal code
a687059c
LW
3874 *
3875 * We can't allocate space until we know how big the compiled form will be,
3876 * but we can't compile it (and thus know how big it is) until we've got a
3877 * place to put the code. So we cheat: we compile it twice, once with code
3878 * generation turned off and size counting turned on, and once "for real".
3879 * This also means that we don't allocate space until we are sure that the
3880 * thing really will compile successfully, and we never have to move the
3881 * code and thus invalidate pointers into it. (Note that it has to be in
3882 * one piece because free() must be able to free it all.) [NB: not true in perl]
3883 *
3884 * Beware that the optimization-preparation code in here knows about some
3885 * of the structure of the compiled regexp. [I'll say.]
3886 */
b9b4dddf
YO
3887
3888