This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Extend the test-reonly make target on windows (Yves Orton)
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
e2509266
JH
345#define LOC ((RExC_flags & PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
ccb2c380 373#define FAIL(msg) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
386} STMT_END
8615cb43 387
b45f050a 388/*
b45f050a
JF
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
390 */
ccb2c380 391#define Simple_vFAIL(m) STMT_START { \
a28509cc 392 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
395} STMT_END
b45f050a
JF
396
397/*
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
399 */
ccb2c380
MP
400#define vFAIL(m) STMT_START { \
401 if (!SIZE_ONLY) \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
403 Simple_vFAIL(m); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Like Simple_vFAIL(), but accepts two arguments.
408 */
ccb2c380 409#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 410 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
417 */
ccb2c380
MP
418#define vFAIL2(m,a1) STMT_START { \
419 if (!SIZE_ONLY) \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
422} STMT_END
b45f050a
JF
423
424
425/*
426 * Like Simple_vFAIL(), but accepts three arguments.
427 */
ccb2c380 428#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 429 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432} STMT_END
b45f050a
JF
433
434/*
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
436 */
ccb2c380
MP
437#define vFAIL3(m,a1,a2) STMT_START { \
438 if (!SIZE_ONLY) \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Like Simple_vFAIL(), but accepts four arguments.
445 */
ccb2c380 446#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 447 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
b45f050a 451
ccb2c380 452#define vWARN(loc,m) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARNdep(loc,m) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465
466#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
477
478#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
9d1d55b5 489
8615cb43 490
cd439c50 491/* Allow for side effects in s */
ccb2c380
MP
492#define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
494} STMT_END
cd439c50 495
fac92740
MJD
496/* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
07be1b83 500 * Position is 1 indexed.
fac92740
MJD
501 */
502
ccb2c380
MP
503#define Set_Node_Offset_To_R(node,byte) STMT_START { \
504 if (! SIZE_ONLY) { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 506 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 507 if((node) < 0) { \
551405c4 508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
509 } else { \
510 RExC_offsets[2*(node)-1] = (byte); \
511 } \
512 } \
513} STMT_END
514
515#define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
518
519#define Set_Node_Length_To_R(node,len) STMT_START { \
520 if (! SIZE_ONLY) { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 522 __LINE__, (int)(node), (int)(len))); \
ccb2c380 523 if((node) < 0) { \
551405c4 524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
525 } else { \
526 RExC_offsets[2*(node)] = (len); \
527 } \
528 } \
529} STMT_END
530
531#define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534#define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
536
537/* Get offsets and lengths */
538#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
540
07be1b83
YO
541#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
544} STMT_END
545
546
547#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548#define EXPERIMENTAL_INPLACESCAN
549#endif
550
1de06328 551#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 552DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 553 PerlIO_printf(Perl_debug_log, \
b515a41d 554 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
562 ); \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
580 ); \
581 PerlIO_printf(Perl_debug_log,"\n"); \
582});
583
acfe0abc 584static void clear_re(pTHX_ void *r);
4327152a 585
653099ff 586/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 587 Update the longest found anchored substring and the longest found
653099ff
GS
588 floating substrings if needed. */
589
4327152a 590STATIC void
1de06328 591S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 592{
e1ec3a88
AL
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 595 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 596
c277df42 597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 598 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
b81d288d 602 data->flags
c277df42
IZ
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
604 else
605 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
a0ed51b3
LW
608 }
609 else {
c277df42 610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
611 data->offset_float_max = (l
612 ? data->last_start_max
c277df42 613 : data->pos_min + data->pos_delta);
9051bda5
HS
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
c277df42 616 if (data->flags & SF_BEFORE_EOL)
b81d288d 617 data->flags
c277df42
IZ
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
619 else
620 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
c277df42
IZ
623 }
624 }
625 SvCUR_set(data->last_found, 0);
0eda9292 626 {
a28509cc 627 SV * const sv = data->last_found;
097eb12c
AL
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
630 if (mg)
631 mg->mg_len = 0;
632 }
0eda9292 633 }
c277df42
IZ
634 data->last_end = -1;
635 data->flags &= ~SF_BEFORE_EOL;
1de06328 636 DEBUG_STUDYDATA(data,0);
c277df42
IZ
637}
638
653099ff
GS
639/* Can match anything (initialization) */
640STATIC void
097eb12c 641S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 642{
653099ff 643 ANYOF_CLASS_ZERO(cl);
f8bef550 644 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
646 if (LOC)
647 cl->flags |= ANYOF_LOCALE;
648}
649
650/* Can match anything (initialization) */
651STATIC int
5f66b61c 652S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
653{
654 int value;
655
aaa51d5e 656 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
658 return 1;
1aa99e6b
IH
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
660 return 0;
10edeb5d 661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 662 return 0;
653099ff
GS
663 return 1;
664}
665
666/* Can match anything (initialization) */
667STATIC void
097eb12c 668S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 669{
8ecf7187 670 Zero(cl, 1, struct regnode_charclass_class);
653099ff 671 cl->type = ANYOF;
830247a4 672 cl_anything(pRExC_state, cl);
653099ff
GS
673}
674
675STATIC void
097eb12c 676S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 677{
8ecf7187 678 Zero(cl, 1, struct regnode_charclass_class);
653099ff 679 cl->type = ANYOF;
830247a4 680 cl_anything(pRExC_state, cl);
653099ff
GS
681 if (LOC)
682 cl->flags |= ANYOF_LOCALE;
683}
684
685/* 'And' a given class with another one. Can create false positives */
686/* We assume that cl is not inverted */
687STATIC void
5f66b61c 688S_cl_and(struct regnode_charclass_class *cl,
a28509cc 689 const struct regnode_charclass_class *and_with)
653099ff 690{
40d049e4
YO
691
692 assert(and_with->type == ANYOF);
653099ff
GS
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
698 int i;
699
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
703 else
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
1aa99e6b 709
14ebb1a2
JH
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
715 }
14ebb1a2
JH
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 718 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 721 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
722}
723
724/* 'OR' a given class with another one. Can create false positives */
725/* We assume that cl is not inverted */
726STATIC void
097eb12c 727S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 728{
653099ff
GS
729 if (or_with->flags & ANYOF_INVERT) {
730 /* We do not use
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
738 */
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
742 int i;
743
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
747 else {
830247a4 748 cl_anything(pRExC_state, cl);
653099ff
GS
749 }
750 } else {
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 753 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
754 || (cl->flags & ANYOF_FOLD)) ) {
755 int i;
756
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
764 }
765 }
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 767 cl_anything(pRExC_state, cl);
653099ff
GS
768 }
769 }
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
772
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
781 }
653099ff
GS
782}
783
a3621e74
YO
784#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
788
3dab1dad
YO
789
790#ifdef DEBUGGING
07be1b83 791/*
3dab1dad
YO
792 dump_trie(trie)
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
795
796 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
800
3dab1dad
YO
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
07be1b83
YO
803
804*/
3dab1dad
YO
805
806/*
807 dump_trie(trie)
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
810*/
811
812STATIC void
813S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
814{
815 U32 state;
ab3bbdeb
YO
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
818 GET_RE_DEBUG_FLAGS_DECL;
819
ab3bbdeb 820
3dab1dad
YO
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
824
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 827 if ( tmp ) {
ab3bbdeb
YO
828 PerlIO_printf( Perl_debug_log, "%*s",
829 colwidth,
ddc5bc0f 830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
834 )
835 );
3dab1dad
YO
836 }
837 }
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
840
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
843 PerlIO_printf( Perl_debug_log, "\n");
844
1e2e3d02 845 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 846 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
847
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
849
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
852 } else {
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
854 }
855
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
857
858 if ( base ) {
859 U32 ofs = 0;
860
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
864 ofs++;
865
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
867
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
872 {
ab3bbdeb
YO
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
874 colwidth,
3dab1dad
YO
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
876 } else {
ab3bbdeb 877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
878 }
879 }
880
881 PerlIO_printf( Perl_debug_log, "]");
882
883 }
884 PerlIO_printf( Perl_debug_log, "\n" );
885 }
886}
887/*
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
893*/
894STATIC void
895S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
896{
897 U32 state;
ab3bbdeb
YO
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
3dab1dad
YO
905
906 for( state=1 ; state < next_alloc ; state ++ ) {
907 U16 charid;
908
ab3bbdeb 909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
913 } else {
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
916 );
917 }
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
920 if ( tmp ) {
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
922 colwidth,
ddc5bc0f 923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
927 ) ,
1e2e3d02
YO
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
930 );
931 if (!(charid % 10))
664e119d
RGS
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
1e2e3d02 934 }
ab3bbdeb
YO
935 }
936 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
937 }
938}
939
940/*
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
946*/
947STATIC void
948S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
949{
950 U32 state;
951 U16 charid;
ab3bbdeb
YO
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
954 GET_RE_DEBUG_FLAGS_DECL;
955
956 /*
957 print out the table precompression so that we can do a visual check
958 that they are identical.
959 */
960
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
962
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 965 if ( tmp ) {
ab3bbdeb
YO
966 PerlIO_printf( Perl_debug_log, "%*s",
967 colwidth,
ddc5bc0f 968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
972 )
973 );
3dab1dad
YO
974 }
975 }
976
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
978
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
981 }
982
983 PerlIO_printf( Perl_debug_log, "\n" );
984
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
986
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
990
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
993 if (v)
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
995 else
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
997 }
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1000 } else {
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1003 }
1004 }
07be1b83 1005}
3dab1dad
YO
1006
1007#endif
1008
786e8c11
YO
1009/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
3dab1dad 1019
786e8c11 1020Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1021
786e8c11
YO
1022A trie is an N'ary tree where the branches are determined by digital
1023decomposition of the key. IE, at the root node you look up the 1st character and
1024follow that branch repeat until you find the end of the branches. Nodes can be
1025marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1026
786e8c11 1027 /he|she|his|hers/
72f13be8 1028
786e8c11
YO
1029would convert into the following structure. Numbers represent states, letters
1030following numbers represent valid transitions on the letter from that state, if
1031the number is in square brackets it represents an accepting state, otherwise it
1032will be in parenthesis.
07be1b83 1033
786e8c11
YO
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1035 | |
1036 | (2)
1037 | |
1038 (1) +-i->(6)-+-s->[7]
1039 |
1040 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1041
786e8c11
YO
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1043
1044This shows that when matching against the string 'hers' we will begin at state 1
1045read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048single traverse. We store a mapping from accepting to state to which word was
1049matched, and then when we have multiple possibilities we try to complete the
1050rest of the regex in the order in which they occured in the alternation.
1051
1052The only prior NFA like behaviour that would be changed by the TRIE support is
1053the silent ignoring of duplicate alternations which are of the form:
1054
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1056
1057Thus EVAL blocks follwing a trie may be called a different number of times with
1058and without the optimisation. With the optimisations dupes will be silently
1059ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060the following demonstrates:
1061
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1063
1064which prints out 'word' three times, but
1065
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1067
1068which doesnt print it out at all. This is due to other optimisations kicking in.
1069
1070Example of what happens on a structural level:
1071
1072The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1073
1074 1: CURLYM[1] {1,32767}(18)
1075 5: BRANCH(8)
1076 6: EXACT <ac>(16)
1077 8: BRANCH(11)
1078 9: EXACT <ad>(16)
1079 11: BRANCH(14)
1080 12: EXACT <ab>(16)
1081 16: SUCCEED(0)
1082 17: NOTHING(18)
1083 18: END(0)
1084
1085This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086and should turn into:
1087
1088 1: CURLYM[1] {1,32767}(18)
1089 5: TRIE(16)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1091 <ac>
1092 <ad>
1093 <ab>
1094 16: SUCCEED(0)
1095 17: NOTHING(18)
1096 18: END(0)
1097
1098Cases where tail != last would be like /(?foo|bar)baz/:
1099
1100 1: BRANCH(4)
1101 2: EXACT <foo>(8)
1102 4: BRANCH(7)
1103 5: EXACT <bar>(8)
1104 7: TAIL(8)
1105 8: EXACT <baz>(10)
1106 10: END(0)
1107
1108which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109and would end up looking like:
1110
1111 1: TRIE(8)
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1113 <foo>
1114 <bar>
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1120
1121is the recommended Unicode-aware way of saying
1122
1123 *(d++) = uv;
1124*/
1125
1e2e3d02 1126#define TRIE_STORE_REVCHAR \
786e8c11 1127 STMT_START { \
1e2e3d02 1128 SV *tmp = newSVpvs(""); \
786e8c11 1129 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1132 } STMT_END
1133
1134#define TRIE_READ_CHAR STMT_START { \
1135 wordlen++; \
1136 if ( UTF ) { \
1137 if ( folder ) { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1140 foldlen -= len; \
1141 scan += len; \
1142 len = 0; \
1143 } else { \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1148 } \
1149 } else { \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1151 } \
1152 } else { \
1153 uvc = (U32)*uc; \
1154 len = 1; \
1155 } \
1156} STMT_END
1157
1158
1159
1160#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1164 } \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1168} STMT_END
07be1b83 1169
786e8c11
YO
1170#define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1180 \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1183 DEBUG_r({ \
1184 /* store the word for dumping */ \
1185 SV* tmp; \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1188 else \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1192 }); \
1193 \
1194 curword++; \
1195 \
1196 if ( noper_next < tail ) { \
1197 if (!trie->jump) \
1198 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1199 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1200 if (!jumper) \
1201 jumper = noper_next; \
1202 if (!nextbranch) \
1203 nextbranch= regnext(cur); \
1204 } \
1205 \
1206 if ( dupe ) { \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1216 } else { \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1219 } \
1220} STMT_END
07be1b83 1221
3dab1dad 1222
786e8c11
YO
1223#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1230 )
3dab1dad 1231
786e8c11
YO
1232#define MADE_TRIE 1
1233#define MADE_JUMP_TRIE 2
1234#define MADE_EXACT_TRIE 4
3dab1dad 1235
a3621e74 1236STATIC I32
786e8c11 1237S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1238{
27da23d5 1239 dVAR;
a3621e74
YO
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1242 regnode *cur;
9f7f3913 1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1244 STRLEN len = 0;
1245 UV uvc = 0;
1246 U16 curword = 0;
1247 U32 next_alloc = 0;
786e8c11
YO
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
7f69552c 1250 regnode *convert = NULL;
a3621e74 1251 /* we just use folder as a flag in utf8 */
e1ec3a88 1252 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1253 ? PL_fold
1254 : ( flags == EXACTFL
1255 ? PL_fold_locale
1256 : NULL
1257 )
1258 );
1259
e1ec3a88 1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1261 SV *re_trie_maxbuff;
3dab1dad
YO
1262#ifndef DEBUGGING
1263 /* these are only used during construction but are useful during
8e11feef 1264 * debugging so we store them in the struct when debugging.
8e11feef 1265 */
3dab1dad 1266 STRLEN trie_charcount=0;
3dab1dad
YO
1267 AV *trie_revcharmap;
1268#endif
a3621e74 1269 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1270#ifndef DEBUGGING
1271 PERL_UNUSED_ARG(depth);
1272#endif
a3621e74 1273
a02a5408 1274 Newxz( trie, 1, reg_trie_data );
a3621e74 1275 trie->refcount = 1;
3dab1dad 1276 trie->startstate = 1;
786e8c11 1277 trie->wordcount = word_count;
a3621e74 1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1279 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1282 DEBUG_r({
1283 trie->words = newAV();
a3621e74 1284 });
3dab1dad 1285 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1286
0111c4fd 1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1288 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1290 }
3dab1dad
YO
1291 DEBUG_OPTIMISE_r({
1292 PerlIO_printf( Perl_debug_log,
786e8c11 1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1297 (int)depth);
3dab1dad 1298 });
7f69552c
YO
1299
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1303 convert = first;
1304 } else {
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1307 }
1308
a3621e74
YO
1309 /* -- First loop and Setup --
1310
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1314 have unique chars.
1315
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1319 coded index.
1320
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1329
1330 */
1331
a3621e74 1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1333 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1334 const U8 *uc = (U8*)STRING( noper );
a28509cc 1335 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1336 STRLEN foldlen = 0;
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1338 const U8 *scan = (U8*)NULL;
07be1b83 1339 U32 wordlen = 0; /* required init */
3dab1dad 1340 STRLEN chars=0;
a3621e74 1341
3dab1dad
YO
1342 if (OP(noper) == NOTHING) {
1343 trie->minlen= 0;
1344 continue;
1345 }
1346 if (trie->bitmap) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1349 }
a3621e74 1350 for ( ; uc < e ; uc += len ) {
3dab1dad 1351 TRIE_CHARCOUNT(trie)++;
a3621e74 1352 TRIE_READ_CHAR;
3dab1dad 1353 chars++;
a3621e74
YO
1354 if ( uvc < 256 ) {
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1357 if ( folder )
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1359 TRIE_STORE_REVCHAR;
a3621e74
YO
1360 }
1361 } else {
1362 SV** svpp;
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1365
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1367
1368 if ( !svpp )
e4584336 1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1370
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1373 TRIE_STORE_REVCHAR;
a3621e74
YO
1374 }
1375 }
1376 }
3dab1dad
YO
1377 if( cur == first ) {
1378 trie->minlen=chars;
1379 trie->maxlen=chars;
1380 } else if (chars < trie->minlen) {
1381 trie->minlen=chars;
1382 } else if (chars > trie->maxlen) {
1383 trie->maxlen=chars;
1384 }
1385
a3621e74
YO
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
85c3142d 1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1393 );
786e8c11 1394 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1395
1396 /*
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1401 conservative but potentially much slower representation using an array
1402 of lists.
1403
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1410
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1414
1415 */
1416
1417
3dab1dad 1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1419 /*
1420 Second Pass -- Array Of Lists Representation
1421
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1425
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
a3621e74
YO
1429 */
1430
a3621e74
YO
1431 STRLEN transcount = 1;
1432
1e2e3d02
YO
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1436
3dab1dad 1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1438 TRIE_LIST_NEW(1);
1439 next_alloc = 2;
1440
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1442
c445ea15
AL
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
07be1b83 1450 U32 wordlen = 0; /* required init */
c445ea15
AL
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1452
3dab1dad 1453 if (OP(noper) != NOTHING) {
786e8c11 1454 for ( ; uc < e ; uc += len ) {
c445ea15 1455
786e8c11 1456 TRIE_READ_CHAR;
c445ea15 1457
786e8c11
YO
1458 if ( uvc < 256 ) {
1459 charid = trie->charmap[ uvc ];
c445ea15 1460 } else {
786e8c11
YO
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1462 if ( !svpp ) {
1463 charid = 0;
1464 } else {
1465 charid=(U16)SvIV( *svpp );
1466 }
c445ea15 1467 }
786e8c11
YO
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1469 if ( charid ) {
a3621e74 1470
786e8c11
YO
1471 U16 check;
1472 U32 newstate = 0;
a3621e74 1473
786e8c11
YO
1474 charid--;
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
c445ea15 1477 }
786e8c11
YO
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1481 break;
1482 }
1483 }
1484 if ( ! newstate ) {
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1487 transcount++;
1488 }
1489 state = newstate;
1490 } else {
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1492 }
a28509cc 1493 }
c445ea15 1494 }
3dab1dad 1495 TRIE_HANDLE_WORD(state);
a3621e74
YO
1496
1497 } /* end second pass */
1498
1e2e3d02
YO
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
a3621e74
YO
1501 Renew( trie->states, next_alloc, reg_trie_state );
1502
3dab1dad
YO
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1506 );
a3621e74 1507
a02a5408 1508 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1509 {
1510 U32 state;
a3621e74
YO
1511 U32 tp = 0;
1512 U32 zp = 0;
1513
1514
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1516 U32 base=0;
1517
1518 /*
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1521 );
1522 */
1523
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1526 U16 maxid=minid;
a28509cc 1527 U16 idx;
a3621e74
YO
1528
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1532 minid=forid;
1533 } else if ( forid > maxid ) {
1534 maxid=forid;
1535 }
a3621e74
YO
1536 }
1537 if ( transcount < tp + maxid - minid + 1) {
1538 transcount *= 2;
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1541 }
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1544 U32 set = 0;
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1550 set = 1;
1551 break;
1552 }
1553 }
1554 if ( !set ) {
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1557 tp++;
1558 zp = tp;
1559 }
1560 } else {
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1565 }
1566 tp += ( maxid - minid + 1 );
1567 }
1568 Safefree(trie->states[ state ].trans.list);
1569 }
1570 /*
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1573 );
1574 */
1575 trie->states[ state ].trans.base=base;
1576 }
cc601c31 1577 trie->lasttrans = tp + 1;
a3621e74
YO
1578 }
1579 } else {
1580 /*
1581 Second Pass -- Flat Table Representation.
1582
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1587
1588 We then construct the trie using only the .next slots of the entry
1589 structs.
1590
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1594
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1596 transition.
1597
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1603
1604 A B A B
1605 1. 2 4 1. 3 7
1606 2. 0 3 3. 0 5
1607 3. 0 0 5. 0 0
1608 4. 0 0 7. 0 0
1609
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1613
1614 */
1e2e3d02
YO
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
3dab1dad
YO
1618
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1620 reg_trie_trans );
3dab1dad 1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1622 next_alloc = trie->uniquecharcount + 1;
1623
3dab1dad 1624
a3621e74
YO
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1626
c445ea15 1627 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1630
1631 U32 state = 1; /* required init */
1632
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1636
1637 STRLEN foldlen = 0; /* required init */
07be1b83 1638 U32 wordlen = 0; /* required init */
a3621e74
YO
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1640
3dab1dad 1641 if ( OP(noper) != NOTHING ) {
786e8c11 1642 for ( ; uc < e ; uc += len ) {
a3621e74 1643
786e8c11 1644 TRIE_READ_CHAR;
a3621e74 1645
786e8c11
YO
1646 if ( uvc < 256 ) {
1647 charid = trie->charmap[ uvc ];
1648 } else {
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1651 }
786e8c11
YO
1652 if ( charid ) {
1653 charid--;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1658 }
1659 state = trie->trans[ state + charid ].next;
1660 } else {
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1662 }
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1664 }
a3621e74 1665 }
3dab1dad
YO
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1668
1669 } /* end second pass */
1670
3dab1dad
YO
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1674 );
a3621e74 1675
a3621e74
YO
1676 {
1677 /*
1678 * Inplace compress the table.*
1679
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1683
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1686
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1689
1690 - If .base is 0 there are no valid transitions from that node.
1691
1692 - If .base is nonzero then charid is added to it to find an entry in
1693 the trans array.
1694
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1700 valid.
1701
786e8c11 1702 XXX - wrong maybe?
a3621e74
YO
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1708 0.
1709
1710 - We set 'pos' to indicate the first entry of the second node.
1711
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1718 compression.
1719
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1722
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
786e8c11 1728 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1729
1730 - Once compressed we can Renew/realloc the structures to release the
1731 excess space.
1732
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1735
1736 demq
1737 */
a3b680e6 1738 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1739 U32 state, charid;
a3621e74 1740 U32 pos = 0, zp=0;
1e2e3d02 1741 trie->statecount = laststate;
a3621e74
YO
1742
1743 for ( state = 1 ; state < laststate ; state++ ) {
1744 U8 flag = 0;
a28509cc
AL
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1748 trie->trans[ stateidx ].check = 0;
1749
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1753 if (o_used == 1) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1756 break;
1757 }
1758 }
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1763 break;
1764 }
1765 used--;
1766 }
1767 if ( !flag ) {
1768 flag = 1;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1770 }
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1773 pos++;
1774 }
1775 }
1776 }
cc601c31 1777 trie->lasttrans = pos + 1;
1e2e3d02 1778 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1779 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1780 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1784 (IV)next_alloc,
1785 (IV)pos,
a3621e74
YO
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1787 );
1788
1789 } /* end table compress */
1790 }
1e2e3d02
YO
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1796 );
cc601c31
YO
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1799
3dab1dad
YO
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1803 );
07be1b83 1804
3dab1dad 1805 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1806 U8 nodetype =(U8)(flags & 0xFF);
1807 char *str=NULL;
786e8c11 1808
07be1b83 1809#ifdef DEBUGGING
e62cc96a 1810 regnode *optimize = NULL;
b57a0404
JH
1811 U32 mjd_offset = 0;
1812 U32 mjd_nodelen = 0;
07be1b83 1813#endif
a3621e74 1814 /*
3dab1dad
YO
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
a3621e74 1821 */
3dab1dad 1822 /* Find the node we are going to overwrite */
7f69552c 1823 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1824 /* branch sub-chain */
3dab1dad 1825 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1826 DEBUG_r({
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1829 });
7f69552c
YO
1830 /* whole branch chain */
1831 } else {
1832 DEBUG_r({
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1836 });
07be1b83 1837 }
7f69552c 1838
07be1b83
YO
1839 DEBUG_OPTIMISE_r(
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
786e8c11 1842 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1843 );
a3621e74 1844
3dab1dad
YO
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
786e8c11 1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1849 U32 state;
1e2e3d02 1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1851 U32 ofs = 0;
8e11feef
RGS
1852 I32 idx = -1;
1853 U32 count = 0;
1854 const U32 base = trie->states[ state ].trans.base;
a3621e74 1855
3dab1dad 1856 if ( trie->states[state].wordnum )
8e11feef 1857 count = 1;
a3621e74 1858
8e11feef 1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1863 {
3dab1dad 1864 if ( ++count > 1 ) {
8e11feef 1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1867 if ( state == 1 ) break;
3dab1dad
YO
1868 if ( count == 2 ) {
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1870 DEBUG_OPTIMISE_r(
8e11feef
RGS
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
786e8c11 1874 (UV)state));
be8e71aa
YO
1875 if (idx >= 0) {
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1878
3dab1dad 1879 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1880 if ( folder )
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1882 DEBUG_OPTIMISE_r(
07be1b83 1883 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1884 );
8e11feef
RGS
1885 }
1886 }
1887 TRIE_BITMAP_SET(trie,*ch);
1888 if ( folder )
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1891 }
1892 idx = ofs;
1893 }
3dab1dad
YO
1894 }
1895 if ( count == 1 ) {
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1897 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1898 DEBUG_OPTIMISE_r(
8e11feef
RGS
1899 PerlIO_printf( Perl_debug_log,
1900 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1901 (int)depth * 2 + 2, "",
786e8c11 1902 (UV)state, (UV)idx, ch)
3dab1dad
YO
1903 );
1904 if ( state==1 ) {
1905 OP( convert ) = nodetype;
1906 str=STRING(convert);
1907 STR_LEN(convert)=0;
1908 }
1909 *str++=*ch;
1910 STR_LEN(convert)++;
a3621e74 1911
8e11feef 1912 } else {
f9049ba1 1913#ifdef DEBUGGING
8e11feef
RGS
1914 if (state>1)
1915 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1916#endif
8e11feef
RGS
1917 break;
1918 }
1919 }
3dab1dad 1920 if (str) {
8e11feef 1921 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1922 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1923 trie->startstate = state;
07be1b83
YO
1924 trie->minlen -= (state - 1);
1925 trie->maxlen -= (state - 1);
1926 DEBUG_r({
1927 regnode *fix = convert;
1928 mjd_nodelen++;
1929 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1930 while( ++fix < n ) {
1931 Set_Node_Offset_Length(fix, 0, 0);
1932 }
1933 });
8e11feef
RGS
1934 if (trie->maxlen) {
1935 convert = n;
1936 } else {
3dab1dad 1937 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1938 DEBUG_r(optimize= n);
3dab1dad
YO
1939 }
1940 }
1941 }
a5ca303d
YO
1942 if (!jumper)
1943 jumper = last;
3dab1dad 1944 if ( trie->maxlen ) {
8e11feef
RGS
1945 NEXT_OFF( convert ) = (U16)(tail - convert);
1946 ARG_SET( convert, data_slot );
786e8c11
YO
1947 /* Store the offset to the first unabsorbed branch in
1948 jump[0], which is otherwise unused by the jump logic.
1949 We use this when dumping a trie and during optimisation. */
1950 if (trie->jump)
7f69552c 1951 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1952
786e8c11
YO
1953 /* XXXX */
1954 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1955 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1956 {
1957 OP( convert ) = TRIEC;
1958 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1959 Safefree(trie->bitmap);
1960 trie->bitmap= NULL;
1961 } else
1962 OP( convert ) = TRIE;
a3621e74 1963
3dab1dad
YO
1964 /* store the type in the flags */
1965 convert->flags = nodetype;
a5ca303d
YO
1966 DEBUG_r({
1967 optimize = convert
1968 + NODE_STEP_REGNODE
1969 + regarglen[ OP( convert ) ];
1970 });
1971 /* XXX We really should free up the resource in trie now,
1972 as we won't use them - (which resources?) dmq */
3dab1dad 1973 }
a3621e74 1974 /* needed for dumping*/
e62cc96a 1975 DEBUG_r(if (optimize) {
07be1b83 1976 regnode *opt = convert;
e62cc96a 1977 while ( ++opt < optimize) {
07be1b83
YO
1978 Set_Node_Offset_Length(opt,0,0);
1979 }
786e8c11
YO
1980 /*
1981 Try to clean up some of the debris left after the
1982 optimisation.
a3621e74 1983 */
786e8c11 1984 while( optimize < jumper ) {
07be1b83 1985 mjd_nodelen += Node_Length((optimize));
a3621e74 1986 OP( optimize ) = OPTIMIZED;
07be1b83 1987 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1988 optimize++;
1989 }
07be1b83 1990 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1991 });
1992 } /* end node insert */
07be1b83 1993#ifndef DEBUGGING
6e8b4190 1994 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1995#endif
786e8c11
YO
1996 return trie->jump
1997 ? MADE_JUMP_TRIE
1998 : trie->startstate>1
1999 ? MADE_EXACT_TRIE
2000 : MADE_TRIE;
2001}
2002
2003STATIC void
2004S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2005{
2006/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2007
2008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2009 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2010 ISBN 0-201-10088-6
2011
2012 We find the fail state for each state in the trie, this state is the longest proper
2013 suffix of the current states 'word' that is also a proper prefix of another word in our
2014 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2015 the DFA not to have to restart after its tried and failed a word at a given point, it
2016 simply continues as though it had been matching the other word in the first place.
2017 Consider
2018 'abcdgu'=~/abcdefg|cdgu/
2019 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2020 fail, which would bring use to the state representing 'd' in the second word where we would
2021 try 'g' and succeed, prodceding to match 'cdgu'.
2022 */
2023 /* add a fail transition */
2024 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2025 U32 *q;
2026 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2027 const U32 numstates = trie->statecount;
786e8c11
YO
2028 const U32 ubound = trie->lasttrans + ucharcount;
2029 U32 q_read = 0;
2030 U32 q_write = 0;
2031 U32 charid;
2032 U32 base = trie->states[ 1 ].trans.base;
2033 U32 *fail;
2034 reg_ac_data *aho;
2035 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2036 GET_RE_DEBUG_FLAGS_DECL;
2037#ifndef DEBUGGING
2038 PERL_UNUSED_ARG(depth);
2039#endif
2040
2041
2042 ARG_SET( stclass, data_slot );
2043 Newxz( aho, 1, reg_ac_data );
2044 RExC_rx->data->data[ data_slot ] = (void*)aho;
2045 aho->trie=trie;
2046 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2047 numstates * sizeof(reg_trie_state));
786e8c11
YO
2048 Newxz( q, numstates, U32);
2049 Newxz( aho->fail, numstates, U32 );
2050 aho->refcount = 1;
2051 fail = aho->fail;
2052 /* initialize fail[0..1] to be 1 so that we always have
2053 a valid final fail state */
2054 fail[ 0 ] = fail[ 1 ] = 1;
2055
2056 for ( charid = 0; charid < ucharcount ; charid++ ) {
2057 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2058 if ( newstate ) {
2059 q[ q_write ] = newstate;
2060 /* set to point at the root */
2061 fail[ q[ q_write++ ] ]=1;
2062 }
2063 }
2064 while ( q_read < q_write) {
2065 const U32 cur = q[ q_read++ % numstates ];
2066 base = trie->states[ cur ].trans.base;
2067
2068 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2069 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2070 if (ch_state) {
2071 U32 fail_state = cur;
2072 U32 fail_base;
2073 do {
2074 fail_state = fail[ fail_state ];
2075 fail_base = aho->states[ fail_state ].trans.base;
2076 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2077
2078 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2079 fail[ ch_state ] = fail_state;
2080 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2081 {
2082 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2083 }
2084 q[ q_write++ % numstates] = ch_state;
2085 }
2086 }
2087 }
2088 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2089 when we fail in state 1, this allows us to use the
2090 charclass scan to find a valid start char. This is based on the principle
2091 that theres a good chance the string being searched contains lots of stuff
2092 that cant be a start char.
2093 */
2094 fail[ 0 ] = fail[ 1 ] = 0;
2095 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2096 PerlIO_printf(Perl_debug_log,
2097 "%*sStclass Failtable (%"UVuf" states): 0",
2098 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2099 );
786e8c11
YO
2100 for( q_read=1; q_read<numstates; q_read++ ) {
2101 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2102 }
2103 PerlIO_printf(Perl_debug_log, "\n");
2104 });
2105 Safefree(q);
2106 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2107}
2108
786e8c11 2109
a3621e74 2110/*
5d1c421c
JH
2111 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2112 * These need to be revisited when a newer toolchain becomes available.
2113 */
2114#if defined(__sparc64__) && defined(__GNUC__)
2115# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2116# undef SPARC64_GCC_WORKAROUND
2117# define SPARC64_GCC_WORKAROUND 1
2118# endif
2119#endif
2120
07be1b83 2121#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2122 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2123 SV * const mysv=sv_newmortal(); \
2124 regnode *Next = regnext(scan); \
2125 regprop(RExC_rx, mysv, scan); \
7f69552c 2126 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2127 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2128 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2129 }});
07be1b83 2130
1de06328
YO
2131
2132
2133
2134
07be1b83
YO
2135#define JOIN_EXACT(scan,min,flags) \
2136 if (PL_regkind[OP(scan)] == EXACT) \
2137 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2138
be8e71aa 2139STATIC U32
07be1b83
YO
2140S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2141 /* Merge several consecutive EXACTish nodes into one. */
2142 regnode *n = regnext(scan);
2143 U32 stringok = 1;
2144 regnode *next = scan + NODE_SZ_STR(scan);
2145 U32 merged = 0;
2146 U32 stopnow = 0;
2147#ifdef DEBUGGING
2148 regnode *stop = scan;
72f13be8 2149 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2150#else
d47053eb
RGS
2151 PERL_UNUSED_ARG(depth);
2152#endif
2153#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2154 PERL_UNUSED_ARG(flags);
2155 PERL_UNUSED_ARG(val);
07be1b83 2156#endif
07be1b83
YO
2157 DEBUG_PEEP("join",scan,depth);
2158
2159 /* Skip NOTHING, merge EXACT*. */
2160 while (n &&
2161 ( PL_regkind[OP(n)] == NOTHING ||
2162 (stringok && (OP(n) == OP(scan))))
2163 && NEXT_OFF(n)
2164 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2165
2166 if (OP(n) == TAIL || n > next)
2167 stringok = 0;
2168 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2169 DEBUG_PEEP("skip:",n,depth);
2170 NEXT_OFF(scan) += NEXT_OFF(n);
2171 next = n + NODE_STEP_REGNODE;
2172#ifdef DEBUGGING
2173 if (stringok)
2174 stop = n;
2175#endif
2176 n = regnext(n);
2177 }
2178 else if (stringok) {
786e8c11 2179 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2180 regnode * const nnext = regnext(n);
2181
2182 DEBUG_PEEP("merg",n,depth);
2183
2184 merged++;
2185 if (oldl + STR_LEN(n) > U8_MAX)
2186 break;
2187 NEXT_OFF(scan) += NEXT_OFF(n);
2188 STR_LEN(scan) += STR_LEN(n);
2189 next = n + NODE_SZ_STR(n);
2190 /* Now we can overwrite *n : */
2191 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2192#ifdef DEBUGGING
2193 stop = next - 1;
2194#endif
2195 n = nnext;
2196 if (stopnow) break;
2197 }
2198
d47053eb
RGS
2199#ifdef EXPERIMENTAL_INPLACESCAN
2200 if (flags && !NEXT_OFF(n)) {
2201 DEBUG_PEEP("atch", val, depth);
2202 if (reg_off_by_arg[OP(n)]) {
2203 ARG_SET(n, val - n);
2204 }
2205 else {
2206 NEXT_OFF(n) = val - n;
2207 }
2208 stopnow = 1;
2209 }
07be1b83
YO
2210#endif
2211 }
2212
2213 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2214 /*
2215 Two problematic code points in Unicode casefolding of EXACT nodes:
2216
2217 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2218 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2219
2220 which casefold to
2221
2222 Unicode UTF-8
2223
2224 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2225 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2226
2227 This means that in case-insensitive matching (or "loose matching",
2228 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2229 length of the above casefolded versions) can match a target string
2230 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2231 This would rather mess up the minimum length computation.
2232
2233 What we'll do is to look for the tail four bytes, and then peek
2234 at the preceding two bytes to see whether we need to decrease
2235 the minimum length by four (six minus two).
2236
2237 Thanks to the design of UTF-8, there cannot be false matches:
2238 A sequence of valid UTF-8 bytes cannot be a subsequence of
2239 another valid sequence of UTF-8 bytes.
2240
2241 */
2242 char * const s0 = STRING(scan), *s, *t;
2243 char * const s1 = s0 + STR_LEN(scan) - 1;
2244 char * const s2 = s1 - 4;
e294cc5d
JH
2245#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2246 const char t0[] = "\xaf\x49\xaf\x42";
2247#else
07be1b83 2248 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2249#endif
07be1b83
YO
2250 const char * const t1 = t0 + 3;
2251
2252 for (s = s0 + 2;
2253 s < s2 && (t = ninstr(s, s1, t0, t1));
2254 s = t + 4) {
e294cc5d
JH
2255#ifdef EBCDIC
2256 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2257 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2258#else
07be1b83
YO
2259 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2260 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2261#endif
07be1b83
YO
2262 *min -= 4;
2263 }
2264 }
2265
2266#ifdef DEBUGGING
2267 /* Allow dumping */
2268 n = scan + NODE_SZ_STR(scan);
2269 while (n <= stop) {
2270 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2271 OP(n) = OPTIMIZED;
2272 NEXT_OFF(n) = 0;
2273 }
2274 n++;
2275 }
2276#endif
2277 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2278 return stopnow;
2279}
2280
653099ff
GS
2281/* REx optimizer. Converts nodes into quickier variants "in place".
2282 Finds fixed substrings. */
2283
a0288114 2284/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2285 to the position after last scanned or to NULL. */
2286
40d049e4
YO
2287#define INIT_AND_WITHP \
2288 assert(!and_withp); \
2289 Newx(and_withp,1,struct regnode_charclass_class); \
2290 SAVEFREEPV(and_withp)
07be1b83 2291
b515a41d
YO
2292/* this is a chain of data about sub patterns we are processing that
2293 need to be handled seperately/specially in study_chunk. Its so
2294 we can simulate recursion without losing state. */
2295struct scan_frame;
2296typedef struct scan_frame {
2297 regnode *last; /* last node to process in this frame */
2298 regnode *next; /* next node to process when last is reached */
2299 struct scan_frame *prev; /*previous frame*/
2300 I32 stop; /* what stopparen do we use */
2301} scan_frame;
2302
76e3520e 2303STATIC I32
40d049e4 2304S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2305 I32 *minlenp, I32 *deltap,
40d049e4
YO
2306 regnode *last,
2307 scan_data_t *data,
2308 I32 stopparen,
2309 U8* recursed,
2310 struct regnode_charclass_class *and_withp,
2311 U32 flags, U32 depth)
c277df42
IZ
2312 /* scanp: Start here (read-write). */
2313 /* deltap: Write maxlen-minlen here. */
2314 /* last: Stop before this one. */
40d049e4
YO
2315 /* data: string data about the pattern */
2316 /* stopparen: treat close N as END */
2317 /* recursed: which subroutines have we recursed into */
2318 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2319{
97aff369 2320 dVAR;
c277df42
IZ
2321 I32 min = 0, pars = 0, code;
2322 regnode *scan = *scanp, *next;
2323 I32 delta = 0;
2324 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2325 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2326 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2327 scan_data_t data_fake;
a3621e74 2328 SV *re_trie_maxbuff = NULL;
786e8c11 2329 regnode *first_non_open = scan;
e2e6a0f1 2330 I32 stopmin = I32_MAX;
8aa23a47
YO
2331 scan_frame *frame = NULL;
2332
a3621e74 2333 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2334
13a24bad 2335#ifdef DEBUGGING
40d049e4 2336 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2337#endif
40d049e4 2338
786e8c11 2339 if ( depth == 0 ) {
40d049e4 2340 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2341 first_non_open=regnext(first_non_open);
2342 }
2343
b81d288d 2344
8aa23a47
YO
2345 fake_study_recurse:
2346 while ( scan && OP(scan) != END && scan < last ){
2347 /* Peephole optimizer: */
2348 DEBUG_STUDYDATA(data,depth);
2349 DEBUG_PEEP("Peep",scan,depth);
2350 JOIN_EXACT(scan,&min,0);
2351
2352 /* Follow the next-chain of the current node and optimize
2353 away all the NOTHINGs from it. */
2354 if (OP(scan) != CURLYX) {
2355 const int max = (reg_off_by_arg[OP(scan)]
2356 ? I32_MAX
2357 /* I32 may be smaller than U16 on CRAYs! */
2358 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2359 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2360 int noff;
2361 regnode *n = scan;
2362
2363 /* Skip NOTHING and LONGJMP. */
2364 while ((n = regnext(n))
2365 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2366 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2367 && off + noff < max)
2368 off += noff;
2369 if (reg_off_by_arg[OP(scan)])
2370 ARG(scan) = off;
2371 else
2372 NEXT_OFF(scan) = off;
2373 }
a3621e74 2374
c277df42 2375
8aa23a47
YO
2376
2377 /* The principal pseudo-switch. Cannot be a switch, since we
2378 look into several different things. */
2379 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2380 || OP(scan) == IFTHEN) {
2381 next = regnext(scan);
2382 code = OP(scan);
2383 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2384
2385 if (OP(next) == code || code == IFTHEN) {
2386 /* NOTE - There is similar code to this block below for handling
2387 TRIE nodes on a re-study. If you change stuff here check there
2388 too. */
2389 I32 max1 = 0, min1 = I32_MAX, num = 0;
2390 struct regnode_charclass_class accum;
2391 regnode * const startbranch=scan;
2392
2393 if (flags & SCF_DO_SUBSTR)
2394 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2395 if (flags & SCF_DO_STCLASS)
2396 cl_init_zero(pRExC_state, &accum);
2397
2398 while (OP(scan) == code) {
2399 I32 deltanext, minnext, f = 0, fake;
2400 struct regnode_charclass_class this_class;
2401
2402 num++;
2403 data_fake.flags = 0;
2404 if (data) {
2405 data_fake.whilem_c = data->whilem_c;
2406 data_fake.last_closep = data->last_closep;
2407 }
2408 else
2409 data_fake.last_closep = &fake;
2410 next = regnext(scan);
2411 scan = NEXTOPER(scan);
2412 if (code != BRANCH)
c277df42 2413 scan = NEXTOPER(scan);
8aa23a47
YO
2414 if (flags & SCF_DO_STCLASS) {
2415 cl_init(pRExC_state, &this_class);
2416 data_fake.start_class = &this_class;
2417 f = SCF_DO_STCLASS_AND;
2418 }
2419 if (flags & SCF_WHILEM_VISITED_POS)
2420 f |= SCF_WHILEM_VISITED_POS;
2421
2422 /* we suppose the run is continuous, last=next...*/
2423 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2424 next, &data_fake,
2425 stopparen, recursed, NULL, f,depth+1);
2426 if (min1 > minnext)
2427 min1 = minnext;
2428 if (max1 < minnext + deltanext)
2429 max1 = minnext + deltanext;
2430 if (deltanext == I32_MAX)
2431 is_inf = is_inf_internal = 1;
2432 scan = next;
2433 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2434 pars++;
2435 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2436 if ( stopmin > minnext)
2437 stopmin = min + min1;
2438 flags &= ~SCF_DO_SUBSTR;
2439 if (data)
2440 data->flags |= SCF_SEEN_ACCEPT;
2441 }
2442 if (data) {
2443 if (data_fake.flags & SF_HAS_EVAL)
2444 data->flags |= SF_HAS_EVAL;
2445 data->whilem_c = data_fake.whilem_c;
3dab1dad 2446 }
8aa23a47
YO
2447 if (flags & SCF_DO_STCLASS)
2448 cl_or(pRExC_state, &accum, &this_class);
2449 }
2450 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2451 min1 = 0;
2452 if (flags & SCF_DO_SUBSTR) {
2453 data->pos_min += min1;
2454 data->pos_delta += max1 - min1;
2455 if (max1 != min1 || is_inf)
2456 data->longest = &(data->longest_float);
2457 }
2458 min += min1;
2459 delta += max1 - min1;
2460 if (flags & SCF_DO_STCLASS_OR) {
2461 cl_or(pRExC_state, data->start_class, &accum);
2462 if (min1) {
2463 cl_and(data->start_class, and_withp);
2464 flags &= ~SCF_DO_STCLASS;
653099ff 2465 }
8aa23a47
YO
2466 }
2467 else if (flags & SCF_DO_STCLASS_AND) {
2468 if (min1) {
2469 cl_and(data->start_class, &accum);
2470 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2471 }
8aa23a47
YO
2472 else {
2473 /* Switch to OR mode: cache the old value of
2474 * data->start_class */
2475 INIT_AND_WITHP;
2476 StructCopy(data->start_class, and_withp,
2477 struct regnode_charclass_class);
2478 flags &= ~SCF_DO_STCLASS_AND;
2479 StructCopy(&accum, data->start_class,
2480 struct regnode_charclass_class);
2481 flags |= SCF_DO_STCLASS_OR;
2482 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2483 }
8aa23a47 2484 }
a3621e74 2485
8aa23a47
YO
2486 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2487 /* demq.
a3621e74 2488
8aa23a47
YO
2489 Assuming this was/is a branch we are dealing with: 'scan' now
2490 points at the item that follows the branch sequence, whatever
2491 it is. We now start at the beginning of the sequence and look
2492 for subsequences of
a3621e74 2493
8aa23a47
YO
2494 BRANCH->EXACT=>x1
2495 BRANCH->EXACT=>x2
2496 tail
a3621e74 2497
8aa23a47 2498 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2499
8aa23a47
YO
2500 If we can find such a subseqence we need to turn the first
2501 element into a trie and then add the subsequent branch exact
2502 strings to the trie.
a3621e74 2503
8aa23a47 2504 We have two cases
a3621e74 2505
8aa23a47 2506 1. patterns where the whole set of branch can be converted.
a3621e74 2507
8aa23a47 2508 2. patterns where only a subset can be converted.
a3621e74 2509
8aa23a47
YO
2510 In case 1 we can replace the whole set with a single regop
2511 for the trie. In case 2 we need to keep the start and end
2512 branchs so
a3621e74 2513
8aa23a47
YO
2514 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2515 becomes BRANCH TRIE; BRANCH X;
786e8c11 2516
8aa23a47
YO
2517 There is an additional case, that being where there is a
2518 common prefix, which gets split out into an EXACT like node
2519 preceding the TRIE node.
a3621e74 2520
8aa23a47
YO
2521 If x(1..n)==tail then we can do a simple trie, if not we make
2522 a "jump" trie, such that when we match the appropriate word
2523 we "jump" to the appopriate tail node. Essentailly we turn
2524 a nested if into a case structure of sorts.
b515a41d 2525
8aa23a47
YO
2526 */
2527
2528 int made=0;
2529 if (!re_trie_maxbuff) {
2530 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2531 if (!SvIOK(re_trie_maxbuff))
2532 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2533 }
2534 if ( SvIV(re_trie_maxbuff)>=0 ) {
2535 regnode *cur;
2536 regnode *first = (regnode *)NULL;
2537 regnode *last = (regnode *)NULL;
2538 regnode *tail = scan;
2539 U8 optype = 0;
2540 U32 count=0;
a3621e74
YO
2541
2542#ifdef DEBUGGING
8aa23a47 2543 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2544#endif
8aa23a47
YO
2545 /* var tail is used because there may be a TAIL
2546 regop in the way. Ie, the exacts will point to the
2547 thing following the TAIL, but the last branch will
2548 point at the TAIL. So we advance tail. If we
2549 have nested (?:) we may have to move through several
2550 tails.
2551 */
2552
2553 while ( OP( tail ) == TAIL ) {
2554 /* this is the TAIL generated by (?:) */
2555 tail = regnext( tail );
2556 }
a3621e74 2557
8aa23a47
YO
2558
2559 DEBUG_OPTIMISE_r({
2560 regprop(RExC_rx, mysv, tail );
2561 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2562 (int)depth * 2 + 2, "",
2563 "Looking for TRIE'able sequences. Tail node is: ",
2564 SvPV_nolen_const( mysv )
2565 );
2566 });
2567
2568 /*
2569
2570 step through the branches, cur represents each
2571 branch, noper is the first thing to be matched
2572 as part of that branch and noper_next is the
2573 regnext() of that node. if noper is an EXACT
2574 and noper_next is the same as scan (our current
2575 position in the regex) then the EXACT branch is
2576 a possible optimization target. Once we have
2577 two or more consequetive such branches we can
2578 create a trie of the EXACT's contents and stich
2579 it in place. If the sequence represents all of
2580 the branches we eliminate the whole thing and
2581 replace it with a single TRIE. If it is a
2582 subsequence then we need to stitch it in. This
2583 means the first branch has to remain, and needs
2584 to be repointed at the item on the branch chain
2585 following the last branch optimized. This could
2586 be either a BRANCH, in which case the
2587 subsequence is internal, or it could be the
2588 item following the branch sequence in which
2589 case the subsequence is at the end.
2590
2591 */
2592
2593 /* dont use tail as the end marker for this traverse */
2594 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2595 regnode * const noper = NEXTOPER( cur );
b515a41d 2596#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2597 regnode * const noper_next = regnext( noper );
b515a41d
YO
2598#endif
2599
8aa23a47
YO
2600 DEBUG_OPTIMISE_r({
2601 regprop(RExC_rx, mysv, cur);
2602 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2603 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2604
2605 regprop(RExC_rx, mysv, noper);
2606 PerlIO_printf( Perl_debug_log, " -> %s",
2607 SvPV_nolen_const(mysv));
2608
2609 if ( noper_next ) {
2610 regprop(RExC_rx, mysv, noper_next );
2611 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2612 SvPV_nolen_const(mysv));
2613 }
2614 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2615 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2616 });
2617 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2618 : PL_regkind[ OP( noper ) ] == EXACT )
2619 || OP(noper) == NOTHING )
786e8c11 2620#ifdef NOJUMPTRIE
8aa23a47 2621 && noper_next == tail
786e8c11 2622#endif
8aa23a47
YO
2623 && count < U16_MAX)
2624 {
2625 count++;
2626 if ( !first || optype == NOTHING ) {
2627 if (!first) first = cur;
2628 optype = OP( noper );
2629 } else {
2630 last = cur;
2631 }
2632 } else {
2633 if ( last ) {
2634 make_trie( pRExC_state,
2635 startbranch, first, cur, tail, count,
2636 optype, depth+1 );
2637 }
2638 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2639#ifdef NOJUMPTRIE
8aa23a47 2640 && noper_next == tail
786e8c11 2641#endif
8aa23a47
YO
2642 ){
2643 count = 1;
2644 first = cur;
2645 optype = OP( noper );
2646 } else {
2647 count = 0;
2648 first = NULL;
2649 optype = 0;
2650 }
2651 last = NULL;
2652 }
2653 }
2654 DEBUG_OPTIMISE_r({
2655 regprop(RExC_rx, mysv, cur);
2656 PerlIO_printf( Perl_debug_log,
2657 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2658 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2659
2660 });
2661 if ( last ) {
2662 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2663#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2664 if ( ((made == MADE_EXACT_TRIE &&
2665 startbranch == first)
2666 || ( first_non_open == first )) &&
2667 depth==0 ) {
2668 flags |= SCF_TRIE_RESTUDY;
2669 if ( startbranch == first
2670 && scan == tail )
2671 {
2672 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2673 }
2674 }
3dab1dad 2675#endif
8aa23a47
YO
2676 }
2677 }
2678
2679 } /* do trie */
2680
653099ff 2681 }
8aa23a47
YO
2682 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2683 scan = NEXTOPER(NEXTOPER(scan));
2684 } else /* single branch is optimized. */
2685 scan = NEXTOPER(scan);
2686 continue;
2687 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2688 scan_frame *newframe = NULL;
2689 I32 paren;
2690 regnode *start;
2691 regnode *end;
2692
2693 if (OP(scan) != SUSPEND) {
2694 /* set the pointer */
2695 if (OP(scan) == GOSUB) {
2696 paren = ARG(scan);
2697 RExC_recurse[ARG2L(scan)] = scan;
2698 start = RExC_open_parens[paren-1];
2699 end = RExC_close_parens[paren-1];
2700 } else {
2701 paren = 0;
2702 start = RExC_rx->program + 1;
2703 end = RExC_opend;
2704 }
2705 if (!recursed) {
2706 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2707 SAVEFREEPV(recursed);
2708 }
2709 if (!PAREN_TEST(recursed,paren+1)) {
2710 PAREN_SET(recursed,paren+1);
2711 Newx(newframe,1,scan_frame);
2712 } else {
2713 if (flags & SCF_DO_SUBSTR) {
2714 scan_commit(pRExC_state,data,minlenp);
2715 data->longest = &(data->longest_float);
2716 }
2717 is_inf = is_inf_internal = 1;
2718 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2719 cl_anything(pRExC_state, data->start_class);
2720 flags &= ~SCF_DO_STCLASS;
2721 }
2722 } else {
2723 Newx(newframe,1,scan_frame);
2724 paren = stopparen;
2725 start = scan+2;
2726 end = regnext(scan);
2727 }
2728 if (newframe) {
2729 assert(start);
2730 assert(end);
2731 SAVEFREEPV(newframe);
2732 newframe->next = regnext(scan);
2733 newframe->last = last;
2734 newframe->stop = stopparen;
2735 newframe->prev = frame;
2736
2737 frame = newframe;
2738 scan = start;
2739 stopparen = paren;
2740 last = end;
2741
2742 continue;
2743 }
2744 }
2745 else if (OP(scan) == EXACT) {
2746 I32 l = STR_LEN(scan);
2747 UV uc;
2748 if (UTF) {
2749 const U8 * const s = (U8*)STRING(scan);
2750 l = utf8_length(s, s + l);
2751 uc = utf8_to_uvchr(s, NULL);
2752 } else {
2753 uc = *((U8*)STRING(scan));
2754 }
2755 min += l;
2756 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2757 /* The code below prefers earlier match for fixed
2758 offset, later match for variable offset. */
2759 if (data->last_end == -1) { /* Update the start info. */
2760 data->last_start_min = data->pos_min;
2761 data->last_start_max = is_inf
2762 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2763 }
8aa23a47
YO
2764 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2765 if (UTF)
2766 SvUTF8_on(data->last_found);
2767 {
2768 SV * const sv = data->last_found;
2769 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2770 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2771 if (mg && mg->mg_len >= 0)
2772 mg->mg_len += utf8_length((U8*)STRING(scan),
2773 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2774 }
8aa23a47
YO
2775 data->last_end = data->pos_min + l;
2776 data->pos_min += l; /* As in the first entry. */
2777 data->flags &= ~SF_BEFORE_EOL;
2778 }
2779 if (flags & SCF_DO_STCLASS_AND) {
2780 /* Check whether it is compatible with what we know already! */
2781 int compat = 1;
2782
2783 if (uc >= 0x100 ||
2784 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2785 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2786 && (!(data->start_class->flags & ANYOF_FOLD)
2787 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2788 )
2789 compat = 0;
2790 ANYOF_CLASS_ZERO(data->start_class);
2791 ANYOF_BITMAP_ZERO(data->start_class);
2792 if (compat)
2793 ANYOF_BITMAP_SET(data->start_class, uc);
2794 data->start_class->flags &= ~ANYOF_EOS;
2795 if (uc < 0x100)
2796 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2797 }
2798 else if (flags & SCF_DO_STCLASS_OR) {
2799 /* false positive possible if the class is case-folded */
2800 if (uc < 0x100)
2801 ANYOF_BITMAP_SET(data->start_class, uc);
2802 else
2803 data->start_class->flags |= ANYOF_UNICODE_ALL;
2804 data->start_class->flags &= ~ANYOF_EOS;
2805 cl_and(data->start_class, and_withp);
2806 }
2807 flags &= ~SCF_DO_STCLASS;
2808 }
2809 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2810 I32 l = STR_LEN(scan);
2811 UV uc = *((U8*)STRING(scan));
2812
2813 /* Search for fixed substrings supports EXACT only. */
2814 if (flags & SCF_DO_SUBSTR) {
2815 assert(data);
2816 scan_commit(pRExC_state, data, minlenp);
2817 }
2818 if (UTF) {
2819 const U8 * const s = (U8 *)STRING(scan);
2820 l = utf8_length(s, s + l);
2821 uc = utf8_to_uvchr(s, NULL);
2822 }
2823 min += l;
2824 if (flags & SCF_DO_SUBSTR)
2825 data->pos_min += l;
2826 if (flags & SCF_DO_STCLASS_AND) {
2827 /* Check whether it is compatible with what we know already! */
2828 int compat = 1;
2829
2830 if (uc >= 0x100 ||
2831 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2832 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2833 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2834 compat = 0;
2835 ANYOF_CLASS_ZERO(data->start_class);
2836 ANYOF_BITMAP_ZERO(data->start_class);
2837 if (compat) {
2838 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2839 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2840 data->start_class->flags |= ANYOF_FOLD;
2841 if (OP(scan) == EXACTFL)
2842 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2843 }
8aa23a47
YO
2844 }
2845 else if (flags & SCF_DO_STCLASS_OR) {
2846 if (data->start_class->flags & ANYOF_FOLD) {
2847 /* false positive possible if the class is case-folded.
2848 Assume that the locale settings are the same... */
1aa99e6b
IH
2849 if (uc < 0x100)
2850 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2851 data->start_class->flags &= ~ANYOF_EOS;
2852 }
8aa23a47 2853 cl_and(data->start_class, and_withp);
653099ff 2854 }
8aa23a47
YO
2855 flags &= ~SCF_DO_STCLASS;
2856 }
2857 else if (strchr((const char*)PL_varies,OP(scan))) {
2858 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2859 I32 f = flags, pos_before = 0;
2860 regnode * const oscan = scan;
2861 struct regnode_charclass_class this_class;
2862 struct regnode_charclass_class *oclass = NULL;
2863 I32 next_is_eval = 0;
2864
2865 switch (PL_regkind[OP(scan)]) {
2866 case WHILEM: /* End of (?:...)* . */
2867 scan = NEXTOPER(scan);
2868 goto finish;
2869 case PLUS:
2870 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2871 next = NEXTOPER(scan);
2872 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2873 mincount = 1;
2874 maxcount = REG_INFTY;
2875 next = regnext(scan);
2876 scan = NEXTOPER(scan);
2877 goto do_curly;
2878 }
2879 }
2880 if (flags & SCF_DO_SUBSTR)
2881 data->pos_min++;
2882 min++;
2883 /* Fall through. */
2884 case STAR:
2885 if (flags & SCF_DO_STCLASS) {
2886 mincount = 0;
2887 maxcount = REG_INFTY;
2888 next = regnext(scan);
2889 scan = NEXTOPER(scan);
2890 goto do_curly;
2891 }
2892 is_inf = is_inf_internal = 1;
2893 scan = regnext(scan);
c277df42 2894 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2895 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2896 data->longest = &(data->longest_float);
c277df42 2897 }
8aa23a47
YO
2898 goto optimize_curly_tail;
2899 case CURLY:
2900 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2901 && (scan->flags == stopparen))
2902 {
2903 mincount = 1;
2904 maxcount = 1;
2905 } else {
2906 mincount = ARG1(scan);
2907 maxcount = ARG2(scan);
653099ff 2908 }
8aa23a47
YO
2909 next = regnext(scan);
2910 if (OP(scan) == CURLYX) {
2911 I32 lp = (data ? *(data->last_closep) : 0);
2912 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2913 }
8aa23a47
YO
2914 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2915 next_is_eval = (OP(scan) == EVAL);
2916 do_curly:
2917 if (flags & SCF_DO_SUBSTR) {
2918 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2919 pos_before = data->pos_min;
b45f050a 2920 }
8aa23a47
YO
2921 if (data) {
2922 fl = data->flags;
2923 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2924 if (is_inf)
2925 data->flags |= SF_IS_INF;
2926 }
2927 if (flags & SCF_DO_STCLASS) {
2928 cl_init(pRExC_state, &this_class);
2929 oclass = data->start_class;
2930 data->start_class = &this_class;
2931 f |= SCF_DO_STCLASS_AND;
2932 f &= ~SCF_DO_STCLASS_OR;
2933 }
2934 /* These are the cases when once a subexpression
2935 fails at a particular position, it cannot succeed
2936 even after backtracking at the enclosing scope.
2937
2938 XXXX what if minimal match and we are at the
2939 initial run of {n,m}? */
2940 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2941 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2942
8aa23a47
YO
2943 /* This will finish on WHILEM, setting scan, or on NULL: */
2944 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2945 last, data, stopparen, recursed, NULL,
2946 (mincount == 0
2947 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2948
8aa23a47
YO
2949 if (flags & SCF_DO_STCLASS)
2950 data->start_class = oclass;
2951 if (mincount == 0 || minnext == 0) {
2952 if (flags & SCF_DO_STCLASS_OR) {
2953 cl_or(pRExC_state, data->start_class, &this_class);
2954 }
2955 else if (flags & SCF_DO_STCLASS_AND) {
2956 /* Switch to OR mode: cache the old value of
2957 * data->start_class */
2958 INIT_AND_WITHP;
2959 StructCopy(data->start_class, and_withp,
2960 struct regnode_charclass_class);
2961 flags &= ~SCF_DO_STCLASS_AND;
2962 StructCopy(&this_class, data->start_class,
2963 struct regnode_charclass_class);
2964 flags |= SCF_DO_STCLASS_OR;
2965 data->start_class->flags |= ANYOF_EOS;
2966 }
2967 } else { /* Non-zero len */
2968 if (flags & SCF_DO_STCLASS_OR) {
2969 cl_or(pRExC_state, data->start_class, &this_class);
2970 cl_and(data->start_class, and_withp);
2971 }
2972 else if (flags & SCF_DO_STCLASS_AND)
2973 cl_and(data->start_class, &this_class);
2974 flags &= ~SCF_DO_STCLASS;
2975 }
2976 if (!scan) /* It was not CURLYX, but CURLY. */
2977 scan = next;
2978 if ( /* ? quantifier ok, except for (?{ ... }) */
2979 (next_is_eval || !(mincount == 0 && maxcount == 1))
2980 && (minnext == 0) && (deltanext == 0)
2981 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2982 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2983 && ckWARN(WARN_REGEXP))
2984 {
2985 vWARN(RExC_parse,
2986 "Quantifier unexpected on zero-length expression");
2987 }
2988
2989 min += minnext * mincount;
2990 is_inf_internal |= ((maxcount == REG_INFTY
2991 && (minnext + deltanext) > 0)
2992 || deltanext == I32_MAX);
2993 is_inf |= is_inf_internal;
2994 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2995
2996 /* Try powerful optimization CURLYX => CURLYN. */
2997 if ( OP(oscan) == CURLYX && data
2998 && data->flags & SF_IN_PAR
2999 && !(data->flags & SF_HAS_EVAL)
3000 && !deltanext && minnext == 1 ) {
3001 /* Try to optimize to CURLYN. */
3002 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3003 regnode * const nxt1 = nxt;
497b47a8 3004#ifdef DEBUGGING
8aa23a47 3005 regnode *nxt2;
497b47a8 3006#endif
c277df42 3007
8aa23a47
YO
3008 /* Skip open. */
3009 nxt = regnext(nxt);
3010 if (!strchr((const char*)PL_simple,OP(nxt))
3011 && !(PL_regkind[OP(nxt)] == EXACT
3012 && STR_LEN(nxt) == 1))
3013 goto nogo;
497b47a8 3014#ifdef DEBUGGING
8aa23a47 3015 nxt2 = nxt;
497b47a8 3016#endif
8aa23a47
YO
3017 nxt = regnext(nxt);
3018 if (OP(nxt) != CLOSE)
3019 goto nogo;
3020 if (RExC_open_parens) {
3021 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3022 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3023 }
3024 /* Now we know that nxt2 is the only contents: */
3025 oscan->flags = (U8)ARG(nxt);
3026 OP(oscan) = CURLYN;
3027 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3028
c277df42 3029#ifdef DEBUGGING
8aa23a47
YO
3030 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3031 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3032 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3033 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3034 OP(nxt + 1) = OPTIMIZED; /* was count. */
3035 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3036#endif
8aa23a47
YO
3037 }
3038 nogo:
3039
3040 /* Try optimization CURLYX => CURLYM. */
3041 if ( OP(oscan) == CURLYX && data
3042 && !(data->flags & SF_HAS_PAR)
3043 && !(data->flags & SF_HAS_EVAL)
3044 && !deltanext /* atom is fixed width */
3045 && minnext != 0 /* CURLYM can't handle zero width */
3046 ) {
3047 /* XXXX How to optimize if data == 0? */
3048 /* Optimize to a simpler form. */
3049 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3050 regnode *nxt2;
3051
3052 OP(oscan) = CURLYM;
3053 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3054 && (OP(nxt2) != WHILEM))
3055 nxt = nxt2;
3056 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3057 /* Need to optimize away parenths. */
3058 if (data->flags & SF_IN_PAR) {
3059 /* Set the parenth number. */
3060 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3061
3062 if (OP(nxt) != CLOSE)
3063 FAIL("Panic opt close");
3064 oscan->flags = (U8)ARG(nxt);
3065 if (RExC_open_parens) {
3066 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3067 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3068 }
8aa23a47
YO
3069 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3070 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3071
c277df42 3072#ifdef DEBUGGING
8aa23a47
YO
3073 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3074 OP(nxt + 1) = OPTIMIZED; /* was count. */
3075 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3076 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3077#endif
c277df42 3078#if 0
8aa23a47
YO
3079 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3080 regnode *nnxt = regnext(nxt1);
3081
3082 if (nnxt == nxt) {
3083 if (reg_off_by_arg[OP(nxt1)])
3084 ARG_SET(nxt1, nxt2 - nxt1);
3085 else if (nxt2 - nxt1 < U16_MAX)
3086 NEXT_OFF(nxt1) = nxt2 - nxt1;
3087 else
3088 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3089 }
8aa23a47 3090 nxt1 = nnxt;
c277df42 3091 }
5d1c421c 3092#endif
8aa23a47
YO
3093 /* Optimize again: */
3094 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3095 NULL, stopparen, recursed, NULL, 0,depth+1);
3096 }
3097 else
3098 oscan->flags = 0;
3099 }
3100 else if ((OP(oscan) == CURLYX)
3101 && (flags & SCF_WHILEM_VISITED_POS)
3102 /* See the comment on a similar expression above.
3103 However, this time it not a subexpression
3104 we care about, but the expression itself. */
3105 && (maxcount == REG_INFTY)
3106 && data && ++data->whilem_c < 16) {
3107 /* This stays as CURLYX, we can put the count/of pair. */
3108 /* Find WHILEM (as in regexec.c) */
3109 regnode *nxt = oscan + NEXT_OFF(oscan);
3110
3111 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3112 nxt += ARG(nxt);
3113 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3114 | (RExC_whilem_seen << 4)); /* On WHILEM */
3115 }
3116 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3117 pars++;
3118 if (flags & SCF_DO_SUBSTR) {
3119 SV *last_str = NULL;
3120 int counted = mincount != 0;
a0ed51b3 3121
8aa23a47
YO
3122 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3123#if defined(SPARC64_GCC_WORKAROUND)
3124 I32 b = 0;
3125 STRLEN l = 0;
3126 const char *s = NULL;
3127 I32 old = 0;
b515a41d 3128
8aa23a47
YO
3129 if (pos_before >= data->last_start_min)
3130 b = pos_before;
3131 else
3132 b = data->last_start_min;
b515a41d 3133
8aa23a47
YO
3134 l = 0;
3135 s = SvPV_const(data->last_found, l);
3136 old = b - data->last_start_min;
3137
3138#else
3139 I32 b = pos_before >= data->last_start_min
3140 ? pos_before : data->last_start_min;
3141 STRLEN l;
3142 const char * const s = SvPV_const(data->last_found, l);
3143 I32 old = b - data->last_start_min;
3144#endif
3145
3146 if (UTF)
3147 old = utf8_hop((U8*)s, old) - (U8*)s;
3148
3149 l -= old;
3150 /* Get the added string: */
3151 last_str = newSVpvn(s + old, l);
3152 if (UTF)
3153 SvUTF8_on(last_str);
3154 if (deltanext == 0 && pos_before == b) {
3155 /* What was added is a constant string */
3156 if (mincount > 1) {
3157 SvGROW(last_str, (mincount * l) + 1);
3158 repeatcpy(SvPVX(last_str) + l,
3159 SvPVX_const(last_str), l, mincount - 1);
3160 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3161 /* Add additional parts. */
3162 SvCUR_set(data->last_found,
3163 SvCUR(data->last_found) - l);
3164 sv_catsv(data->last_found, last_str);
3165 {
3166 SV * sv = data->last_found;
3167 MAGIC *mg =
3168 SvUTF8(sv) && SvMAGICAL(sv) ?
3169 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3170 if (mg && mg->mg_len >= 0)
3171 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3172 }
8aa23a47 3173 data->last_end += l * (mincount - 1);
b515a41d 3174 }
8aa23a47
YO
3175 } else {
3176 /* start offset must point into the last copy */
3177 data->last_start_min += minnext * (mincount - 1);
3178 data->last_start_max += is_inf ? I32_MAX
3179 : (maxcount - 1) * (minnext + data->pos_delta);
3180 }
c277df42 3181 }
8aa23a47
YO
3182 /* It is counted once already... */
3183 data->pos_min += minnext * (mincount - counted);
3184 data->pos_delta += - counted * deltanext +
3185 (minnext + deltanext) * maxcount - minnext * mincount;
3186 if (mincount != maxcount) {
3187 /* Cannot extend fixed substrings found inside
3188 the group. */
3189 scan_commit(pRExC_state,data,minlenp);
3190 if (mincount && last_str) {
3191 SV * const sv = data->last_found;
3192 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3193 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3194
3195 if (mg)
3196 mg->mg_len = -1;
3197 sv_setsv(sv, last_str);
3198 data->last_end = data->pos_min;
3199 data->last_start_min =
3200 data->pos_min - CHR_SVLEN(last_str);
3201 data->last_start_max = is_inf
3202 ? I32_MAX
3203 : data->pos_min + data->pos_delta
3204 - CHR_SVLEN(last_str);
3205 }
3206 data->longest = &(data->longest_float);
3207 }
3208 SvREFCNT_dec(last_str);
c277df42 3209 }
8aa23a47
YO
3210 if (data && (fl & SF_HAS_EVAL))
3211 data->flags |= SF_HAS_EVAL;
3212 optimize_curly_tail:
3213 if (OP(oscan) != CURLYX) {
3214 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3215 && NEXT_OFF(next))
3216 NEXT_OFF(oscan) += NEXT_OFF(next);
3217 }
3218 continue;
3219 default: /* REF and CLUMP only? */
3220 if (flags & SCF_DO_SUBSTR) {
3221 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3222 data->longest = &(data->longest_float);
3223 }
3224 is_inf = is_inf_internal = 1;
3225 if (flags & SCF_DO_STCLASS_OR)
3226 cl_anything(pRExC_state, data->start_class);
3227 flags &= ~SCF_DO_STCLASS;
3228 break;
c277df42 3229 }
8aa23a47
YO
3230 }
3231 else if (strchr((const char*)PL_simple,OP(scan))) {
3232 int value = 0;
653099ff 3233
8aa23a47
YO
3234 if (flags & SCF_DO_SUBSTR) {
3235 scan_commit(pRExC_state,data,minlenp);
3236 data->pos_min++;
3237 }
3238 min++;
3239 if (flags & SCF_DO_STCLASS) {
3240 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3241
8aa23a47
YO
3242 /* Some of the logic below assumes that switching
3243 locale on will only add false positives. */
3244 switch (PL_regkind[OP(scan)]) {
3245 case SANY:
3246 default:
3247 do_default:
3248 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3249 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3250 cl_anything(pRExC_state, data->start_class);
3251 break;
3252 case REG_ANY:
3253 if (OP(scan) == SANY)
3254 goto do_default;
3255 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3256 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3257 || (data->start_class->flags & ANYOF_CLASS));
3258 cl_anything(pRExC_state, data->start_class);
653099ff 3259 }
8aa23a47
YO
3260 if (flags & SCF_DO_STCLASS_AND || !value)
3261 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3262 break;
3263 case ANYOF:
3264 if (flags & SCF_DO_STCLASS_AND)
3265 cl_and(data->start_class,
3266 (struct regnode_charclass_class*)scan);
653099ff 3267 else
8aa23a47
YO
3268 cl_or(pRExC_state, data->start_class,
3269 (struct regnode_charclass_class*)scan);
3270 break;
3271 case ALNUM:
3272 if (flags & SCF_DO_STCLASS_AND) {
3273 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3274 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3275 for (value = 0; value < 256; value++)
3276 if (!isALNUM(value))
3277 ANYOF_BITMAP_CLEAR(data->start_class, value);
3278 }
653099ff 3279 }
8aa23a47
YO
3280 else {
3281 if (data->start_class->flags & ANYOF_LOCALE)
3282 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3283 else {
3284 for (value = 0; value < 256; value++)
3285 if (isALNUM(value))
3286 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3287 }
8aa23a47
YO
3288 }
3289 break;
3290 case ALNUML:
3291 if (flags & SCF_DO_STCLASS_AND) {
3292 if (data->start_class->flags & ANYOF_LOCALE)
3293 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3294 }
3295 else {
3296 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3297 data->start_class->flags |= ANYOF_LOCALE;
3298 }
3299 break;
3300 case NALNUM:
3301 if (flags & SCF_DO_STCLASS_AND) {
3302 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3303 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3304 for (value = 0; value < 256; value++)
3305 if (isALNUM(value))
3306 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3307 }
3308 }
8aa23a47
YO
3309 else {
3310 if (data->start_class->flags & ANYOF_LOCALE)
3311 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3312 else {
3313 for (value = 0; value < 256; value++)
3314 if (!isALNUM(value))
3315 ANYOF_BITMAP_SET(data->start_class, value);
3316 }
653099ff 3317 }
8aa23a47
YO
3318 break;
3319 case NALNUML:
3320 if (flags & SCF_DO_STCLASS_AND) {
3321 if (data->start_class->flags & ANYOF_LOCALE)
3322 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3323 }
8aa23a47
YO
3324 else {
3325 data->start_class->flags |= ANYOF_LOCALE;
3326 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3327 }
3328 break;
3329 case SPACE:
3330 if (flags & SCF_DO_STCLASS_AND) {
3331 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3332 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3333 for (value = 0; value < 256; value++)
3334 if (!isSPACE(value))
3335 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3336 }
3337 }
8aa23a47
YO
3338 else {
3339 if (data->start_class->flags & ANYOF_LOCALE)
3340 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3341 else {
3342 for (value = 0; value < 256; value++)
3343 if (isSPACE(value))
3344 ANYOF_BITMAP_SET(data->start_class, value);
3345 }
653099ff 3346 }
8aa23a47
YO
3347 break;
3348 case SPACEL:
3349 if (flags & SCF_DO_STCLASS_AND) {
3350 if (data->start_class->flags & ANYOF_LOCALE)
3351 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3352 }
3353 else {
3354 data->start_class->flags |= ANYOF_LOCALE;
3355 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3356 }
3357 break;
3358 case NSPACE:
3359 if (flags & SCF_DO_STCLASS_AND) {
3360 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3361 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3362 for (value = 0; value < 256; value++)
3363 if (isSPACE(value))
3364 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3365 }
8aa23a47
YO
3366 }
3367 else {
3368 if (data->start_class->flags & ANYOF_LOCALE)
3369 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3370 else {
3371 for (value = 0; value < 256; value++)
3372 if (!isSPACE(value))
3373 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3374 }
3375 }
8aa23a47
YO
3376 break;
3377 case NSPACEL:
3378 if (flags & SCF_DO_STCLASS_AND) {
3379 if (data->start_class->flags & ANYOF_LOCALE) {
3380 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3381 for (value = 0; value < 256; value++)
3382 if (!isSPACE(value))
3383 ANYOF_BITMAP_CLEAR(data->start_class, value);
3384 }
653099ff 3385 }
8aa23a47
YO
3386 else {
3387 data->start_class->flags |= ANYOF_LOCALE;
3388 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3389 }
3390 break;
3391 case DIGIT:
3392 if (flags & SCF_DO_STCLASS_AND) {
3393 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3394 for (value = 0; value < 256; value++)
3395 if (!isDIGIT(value))
3396 ANYOF_BITMAP_CLEAR(data->start_class, value);
3397 }
3398 else {
3399 if (data->start_class->flags & ANYOF_LOCALE)
3400 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3401 else {
3402 for (value = 0; value < 256; value++)
3403 if (isDIGIT(value))
3404 ANYOF_BITMAP_SET(data->start_class, value);
3405 }
3406 }
3407 break;
3408 case NDIGIT:
3409 if (flags & SCF_DO_STCLASS_AND) {
3410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3411 for (value = 0; value < 256; value++)
3412 if (isDIGIT(value))
3413 ANYOF_BITMAP_CLEAR(data->start_class, value);
3414 }
3415 else {
3416 if (data->start_class->flags & ANYOF_LOCALE)
3417 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3418 else {
3419 for (value = 0; value < 256; value++)
3420 if (!isDIGIT(value))
3421 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3422 }
3423 }
8aa23a47
YO
3424 break;
3425 }
3426 if (flags & SCF_DO_STCLASS_OR)
3427 cl_and(data->start_class, and_withp);
3428 flags &= ~SCF_DO_STCLASS;
3429 }
3430 }
3431 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3432 data->flags |= (OP(scan) == MEOL
3433 ? SF_BEFORE_MEOL
3434 : SF_BEFORE_SEOL);
3435 }
3436 else if ( PL_regkind[OP(scan)] == BRANCHJ
3437 /* Lookbehind, or need to calculate parens/evals/stclass: */
3438 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3439 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3440 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3441 || OP(scan) == UNLESSM )
3442 {
3443 /* Negative Lookahead/lookbehind
3444 In this case we can't do fixed string optimisation.
3445 */
1de06328 3446
8aa23a47
YO
3447 I32 deltanext, minnext, fake = 0;
3448 regnode *nscan;
3449 struct regnode_charclass_class intrnl;
3450 int f = 0;
1de06328 3451
8aa23a47
YO
3452 data_fake.flags = 0;
3453 if (data) {
3454 data_fake.whilem_c = data->whilem_c;
3455 data_fake.last_closep = data->last_closep;
c277df42 3456 }
8aa23a47
YO
3457 else
3458 data_fake.last_closep = &fake;
3459 if ( flags & SCF_DO_STCLASS && !scan->flags
3460 && OP(scan) == IFMATCH ) { /* Lookahead */
3461 cl_init(pRExC_state, &intrnl);
3462 data_fake.start_class = &intrnl;
3463 f |= SCF_DO_STCLASS_AND;
3464 }
3465 if (flags & SCF_WHILEM_VISITED_POS)
3466 f |= SCF_WHILEM_VISITED_POS;
3467 next = regnext(scan);
3468 nscan = NEXTOPER(NEXTOPER(scan));
3469 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3470 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3471 if (scan->flags) {
3472 if (deltanext) {
3473 vFAIL("Variable length lookbehind not implemented");
3474 }
3475 else if (minnext > (I32)U8_MAX) {
3476 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3477 }
3478 scan->flags = (U8)minnext;
3479 }
3480 if (data) {
3481 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3482 pars++;
3483 if (data_fake.flags & SF_HAS_EVAL)
3484 data->flags |= SF_HAS_EVAL;
3485 data->whilem_c = data_fake.whilem_c;
3486 }
3487 if (f & SCF_DO_STCLASS_AND) {
3488 const int was = (data->start_class->flags & ANYOF_EOS);
3489
3490 cl_and(data->start_class, &intrnl);
3491 if (was)
3492 data->start_class->flags |= ANYOF_EOS;
3493 }
cb434fcc 3494 }
8aa23a47
YO
3495#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3496 else {
3497 /* Positive Lookahead/lookbehind
3498 In this case we can do fixed string optimisation,
3499 but we must be careful about it. Note in the case of
3500 lookbehind the positions will be offset by the minimum
3501 length of the pattern, something we won't know about
3502 until after the recurse.
3503 */
3504 I32 deltanext, fake = 0;
3505 regnode *nscan;
3506 struct regnode_charclass_class intrnl;
3507 int f = 0;
3508 /* We use SAVEFREEPV so that when the full compile
3509 is finished perl will clean up the allocated
3510 minlens when its all done. This was we don't
3511 have to worry about freeing them when we know
3512 they wont be used, which would be a pain.
3513 */
3514 I32 *minnextp;
3515 Newx( minnextp, 1, I32 );
3516 SAVEFREEPV(minnextp);
3517
3518 if (data) {
3519 StructCopy(data, &data_fake, scan_data_t);
3520 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3521 f |= SCF_DO_SUBSTR;
3522 if (scan->flags)
3523 scan_commit(pRExC_state, &data_fake,minlenp);
3524 data_fake.last_found=newSVsv(data->last_found);
3525 }
3526 }
3527 else
3528 data_fake.last_closep = &fake;
3529 data_fake.flags = 0;
3530 if (is_inf)
3531 data_fake.flags |= SF_IS_INF;
3532 if ( flags & SCF_DO_STCLASS && !scan->flags
3533 && OP(scan) == IFMATCH ) { /* Lookahead */
3534 cl_init(pRExC_state, &intrnl);
3535 data_fake.start_class = &intrnl;
3536 f |= SCF_DO_STCLASS_AND;
3537 }
3538 if (flags & SCF_WHILEM_VISITED_POS)
3539 f |= SCF_WHILEM_VISITED_POS;
3540 next = regnext(scan);
3541 nscan = NEXTOPER(NEXTOPER(scan));
3542
3543 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3544 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3545 if (scan->flags) {
3546 if (deltanext) {
3547 vFAIL("Variable length lookbehind not implemented");
3548 }
3549 else if (*minnextp > (I32)U8_MAX) {
3550 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3551 }
3552 scan->flags = (U8)*minnextp;
3553 }
3554
3555 *minnextp += min;
3556
3557 if (f & SCF_DO_STCLASS_AND) {
3558 const int was = (data->start_class->flags & ANYOF_EOS);
3559
3560 cl_and(data->start_class, &intrnl);
3561 if (was)
3562 data->start_class->flags |= ANYOF_EOS;
3563 }
3564 if (data) {
3565 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3566 pars++;
3567 if (data_fake.flags & SF_HAS_EVAL)
3568 data->flags |= SF_HAS_EVAL;
3569 data->whilem_c = data_fake.whilem_c;
3570 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3571 if (RExC_rx->minlen<*minnextp)
3572 RExC_rx->minlen=*minnextp;
3573 scan_commit(pRExC_state, &data_fake, minnextp);
3574 SvREFCNT_dec(data_fake.last_found);
3575
3576 if ( data_fake.minlen_fixed != minlenp )
3577 {
3578 data->offset_fixed= data_fake.offset_fixed;
3579 data->minlen_fixed= data_fake.minlen_fixed;
3580 data->lookbehind_fixed+= scan->flags;
3581 }
3582 if ( data_fake.minlen_float != minlenp )
3583 {
3584 data->minlen_float= data_fake.minlen_float;
3585 data->offset_float_min=data_fake.offset_float_min;
3586 data->offset_float_max=data_fake.offset_float_max;
3587 data->lookbehind_float+= scan->flags;
3588 }
3589 }
3590 }
3591
3592
40d049e4 3593 }
8aa23a47
YO
3594#endif
3595 }
3596 else if (OP(scan) == OPEN) {
3597 if (stopparen != (I32)ARG(scan))
3598 pars++;
3599 }
3600 else if (OP(scan) == CLOSE) {
3601 if (stopparen == (I32)ARG(scan)) {
3602 break;
3603 }
3604 if ((I32)ARG(scan) == is_par) {
3605 next = regnext(scan);
b515a41d 3606
8aa23a47
YO
3607 if ( next && (OP(next) != WHILEM) && next < last)
3608 is_par = 0; /* Disable optimization */
40d049e4 3609 }
8aa23a47
YO
3610 if (data)
3611 *(data->last_closep) = ARG(scan);
3612 }
3613 else if (OP(scan) == EVAL) {
c277df42
IZ
3614 if (data)
3615 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3616 }
3617 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3618 if (flags & SCF_DO_SUBSTR) {
3619 scan_commit(pRExC_state,data,minlenp);
3620 flags &= ~SCF_DO_SUBSTR;
40d049e4 3621 }
8aa23a47
YO
3622 if (data && OP(scan)==ACCEPT) {
3623 data->flags |= SCF_SEEN_ACCEPT;
3624 if (stopmin > min)
3625 stopmin = min;
e2e6a0f1 3626 }
8aa23a47
YO
3627 }
3628 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3629 {
0f5d15d6 3630 if (flags & SCF_DO_SUBSTR) {
1de06328 3631 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3632 data->longest = &(data->longest_float);
3633 }
3634 is_inf = is_inf_internal = 1;
653099ff 3635 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3636 cl_anything(pRExC_state, data->start_class);
96776eda 3637 flags &= ~SCF_DO_STCLASS;
8aa23a47 3638 }
786e8c11 3639#ifdef TRIE_STUDY_OPT
40d049e4 3640#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3641 else if (PL_regkind[OP(scan)] == TRIE) {
3642 /* NOTE - There is similar code to this block above for handling
3643 BRANCH nodes on the initial study. If you change stuff here
3644 check there too. */
3645 regnode *trie_node= scan;
3646 regnode *tail= regnext(scan);
3647 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3648 I32 max1 = 0, min1 = I32_MAX;
3649 struct regnode_charclass_class accum;
3650
3651 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3652 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3653 if (flags & SCF_DO_STCLASS)
3654 cl_init_zero(pRExC_state, &accum);
3655
3656 if (!trie->jump) {
3657 min1= trie->minlen;
3658 max1= trie->maxlen;
3659 } else {
3660 const regnode *nextbranch= NULL;
3661 U32 word;
3662
3663 for ( word=1 ; word <= trie->wordcount ; word++)
3664 {
3665 I32 deltanext=0, minnext=0, f = 0, fake;
3666 struct regnode_charclass_class this_class;
3667
3668 data_fake.flags = 0;
3669 if (data) {
3670 data_fake.whilem_c = data->whilem_c;
3671 data_fake.last_closep = data->last_closep;
3672 }
3673 else
3674 data_fake.last_closep = &fake;
3675
3676 if (flags & SCF_DO_STCLASS) {
3677 cl_init(pRExC_state, &this_class);
3678 data_fake.start_class = &this_class;
3679 f = SCF_DO_STCLASS_AND;
3680 }
3681 if (flags & SCF_WHILEM_VISITED_POS)
3682 f |= SCF_WHILEM_VISITED_POS;
3683
3684 if (trie->jump[word]) {
3685 if (!nextbranch)
3686 nextbranch = trie_node + trie->jump[0];
3687 scan= trie_node + trie->jump[word];
3688 /* We go from the jump point to the branch that follows
3689 it. Note this means we need the vestigal unused branches
3690 even though they arent otherwise used.
3691 */
3692 minnext = study_chunk(pRExC_state, &scan, minlenp,
3693 &deltanext, (regnode *)nextbranch, &data_fake,
3694 stopparen, recursed, NULL, f,depth+1);
3695 }
3696 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3697 nextbranch= regnext((regnode*)nextbranch);
3698
3699 if (min1 > (I32)(minnext + trie->minlen))
3700 min1 = minnext + trie->minlen;
3701 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3702 max1 = minnext + deltanext + trie->maxlen;
3703 if (deltanext == I32_MAX)
3704 is_inf = is_inf_internal = 1;
3705
3706 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3707 pars++;
3708 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3709 if ( stopmin > min + min1)
3710 stopmin = min + min1;
3711 flags &= ~SCF_DO_SUBSTR;
3712 if (data)
3713 data->flags |= SCF_SEEN_ACCEPT;
3714 }
3715 if (data) {
3716 if (data_fake.flags & SF_HAS_EVAL)
3717 data->flags |= SF_HAS_EVAL;
3718 data->whilem_c = data_fake.whilem_c;
3719 }
3720 if (flags & SCF_DO_STCLASS)
3721 cl_or(pRExC_state, &accum, &this_class);
3722 }
3723 }
3724 if (flags & SCF_DO_SUBSTR) {
3725 data->pos_min += min1;
3726 data->pos_delta += max1 - min1;
3727 if (max1 != min1 || is_inf)
3728 data->longest = &(data->longest_float);
3729 }
3730 min += min1;
3731 delta += max1 - min1;
3732 if (flags & SCF_DO_STCLASS_OR) {
3733 cl_or(pRExC_state, data->start_class, &accum);
3734 if (min1) {
3735 cl_and(data->start_class, and_withp);
3736 flags &= ~SCF_DO_STCLASS;
3737 }
3738 }
3739 else if (flags & SCF_DO_STCLASS_AND) {
3740 if (min1) {
3741 cl_and(data->start_class, &accum);
3742 flags &= ~SCF_DO_STCLASS;
3743 }
3744 else {
3745 /* Switch to OR mode: cache the old value of
3746 * data->start_class */
3747 INIT_AND_WITHP;
3748 StructCopy(data->start_class, and_withp,
3749 struct regnode_charclass_class);
3750 flags &= ~SCF_DO_STCLASS_AND;
3751 StructCopy(&accum, data->start_class,
3752 struct regnode_charclass_class);
3753 flags |= SCF_DO_STCLASS_OR;
3754 data->start_class->flags |= ANYOF_EOS;
3755 }
3756 }
3757 scan= tail;
3758 continue;
3759 }
786e8c11 3760#else
8aa23a47
YO
3761 else if (PL_regkind[OP(scan)] == TRIE) {
3762 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3763 U8*bang=NULL;
3764
3765 min += trie->minlen;
3766 delta += (trie->maxlen - trie->minlen);
3767 flags &= ~SCF_DO_STCLASS; /* xxx */
3768 if (flags & SCF_DO_SUBSTR) {
3769 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3770 data->pos_min += trie->minlen;
3771 data->pos_delta += (trie->maxlen - trie->minlen);
3772 if (trie->maxlen != trie->minlen)
3773 data->longest = &(data->longest_float);
3774 }
3775 if (trie->jump) /* no more substrings -- for now /grr*/
3776 flags &= ~SCF_DO_SUBSTR;
b515a41d 3777 }
8aa23a47
YO
3778#endif /* old or new */
3779#endif /* TRIE_STUDY_OPT */
3780 /* Else: zero-length, ignore. */
3781 scan = regnext(scan);
3782 }
3783 if (frame) {
3784 last = frame->last;
3785 scan = frame->next;
3786 stopparen = frame->stop;
3787 frame = frame->prev;
3788 goto fake_study_recurse;
c277df42
IZ
3789 }
3790
3791 finish:
8aa23a47
YO
3792 assert(!frame);
3793
c277df42 3794 *scanp = scan;
aca2d497 3795 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3796 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3797 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3798 if (is_par > (I32)U8_MAX)
c277df42
IZ
3799 is_par = 0;
3800 if (is_par && pars==1 && data) {
3801 data->flags |= SF_IN_PAR;
3802 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3803 }
3804 else if (pars && data) {
c277df42
IZ
3805 data->flags |= SF_HAS_PAR;
3806 data->flags &= ~SF_IN_PAR;
3807 }
653099ff 3808 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3809 cl_and(data->start_class, and_withp);
786e8c11
YO
3810 if (flags & SCF_TRIE_RESTUDY)
3811 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3812
3813 DEBUG_STUDYDATA(data,depth);
3814
e2e6a0f1 3815 return min < stopmin ? min : stopmin;
c277df42
IZ
3816}
3817
76e3520e 3818STATIC I32
5f66b61c 3819S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3820{
830247a4 3821 if (RExC_rx->data) {
2eb97020 3822 const U32 count = RExC_rx->data->count;
b81d288d 3823 Renewc(RExC_rx->data,
2eb97020 3824 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3825 char, struct reg_data);
2eb97020 3826 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3827 RExC_rx->data->count += n;
a0ed51b3
LW
3828 }
3829 else {
a02a5408 3830 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3831 char, struct reg_data);
a02a5408 3832 Newx(RExC_rx->data->what, n, U8);
830247a4 3833 RExC_rx->data->count = n;
c277df42 3834 }
830247a4
IZ
3835 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3836 return RExC_rx->data->count - n;
c277df42
IZ
3837}
3838
76234dfb 3839#ifndef PERL_IN_XSUB_RE
d88dccdf 3840void
864dbfa3 3841Perl_reginitcolors(pTHX)
d88dccdf 3842{
97aff369 3843 dVAR;
1df70142 3844 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3845 if (s) {
1df70142
AL
3846 char *t = savepv(s);
3847 int i = 0;
3848 PL_colors[0] = t;
d88dccdf 3849 while (++i < 6) {
1df70142
AL
3850 t = strchr(t, '\t');
3851 if (t) {
3852 *t = '\0';
3853 PL_colors[i] = ++t;
d88dccdf
IZ
3854 }
3855 else
1df70142 3856 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3857 }
3858 } else {
1df70142 3859 int i = 0;
b81d288d 3860 while (i < 6)
06b5626a 3861 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3862 }
3863 PL_colorset = 1;
3864}
76234dfb 3865#endif
8615cb43 3866
07be1b83 3867
786e8c11
YO
3868#ifdef TRIE_STUDY_OPT
3869#define CHECK_RESTUDY_GOTO \
3870 if ( \
3871 (data.flags & SCF_TRIE_RESTUDY) \
3872 && ! restudied++ \
3873 ) goto reStudy
3874#else
3875#define CHECK_RESTUDY_GOTO
3876#endif
f9f43