This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
POSIX::remove() and directories
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
e2509266
JH
345#define LOC ((RExC_flags & PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
ccb2c380 373#define FAIL(msg) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
384 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
385 msg, (int)len, RExC_precomp, ellipses); \
386} STMT_END
8615cb43 387
b45f050a 388/*
b45f050a
JF
389 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
390 */
ccb2c380 391#define Simple_vFAIL(m) STMT_START { \
a28509cc 392 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
393 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
394 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
395} STMT_END
b45f050a
JF
396
397/*
398 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
399 */
ccb2c380
MP
400#define vFAIL(m) STMT_START { \
401 if (!SIZE_ONLY) \
402 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
403 Simple_vFAIL(m); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Like Simple_vFAIL(), but accepts two arguments.
408 */
ccb2c380 409#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 410 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
411 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
412 (int)offset, RExC_precomp, RExC_precomp + offset); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
417 */
ccb2c380
MP
418#define vFAIL2(m,a1) STMT_START { \
419 if (!SIZE_ONLY) \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
421 Simple_vFAIL2(m, a1); \
422} STMT_END
b45f050a
JF
423
424
425/*
426 * Like Simple_vFAIL(), but accepts three arguments.
427 */
ccb2c380 428#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 429 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
430 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432} STMT_END
b45f050a
JF
433
434/*
435 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
436 */
ccb2c380
MP
437#define vFAIL3(m,a1,a2) STMT_START { \
438 if (!SIZE_ONLY) \
439 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
440 Simple_vFAIL3(m, a1, a2); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Like Simple_vFAIL(), but accepts four arguments.
445 */
ccb2c380 446#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 447 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
448 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
449 (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
b45f050a 451
ccb2c380 452#define vWARN(loc,m) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
455 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARNdep(loc,m) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
461 "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465
466#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
469 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
475 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
477
478#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
9d1d55b5 489
8615cb43 490
cd439c50 491/* Allow for side effects in s */
ccb2c380
MP
492#define REGC(c,s) STMT_START { \
493 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
494} STMT_END
cd439c50 495
fac92740
MJD
496/* Macros for recording node offsets. 20001227 mjd@plover.com
497 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
498 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
499 * Element 0 holds the number n.
07be1b83 500 * Position is 1 indexed.
fac92740
MJD
501 */
502
ccb2c380
MP
503#define Set_Node_Offset_To_R(node,byte) STMT_START { \
504 if (! SIZE_ONLY) { \
505 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 506 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 507 if((node) < 0) { \
551405c4 508 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
509 } else { \
510 RExC_offsets[2*(node)-1] = (byte); \
511 } \
512 } \
513} STMT_END
514
515#define Set_Node_Offset(node,byte) \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
517#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
518
519#define Set_Node_Length_To_R(node,len) STMT_START { \
520 if (! SIZE_ONLY) { \
521 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 522 __LINE__, (int)(node), (int)(len))); \
ccb2c380 523 if((node) < 0) { \
551405c4 524 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
525 } else { \
526 RExC_offsets[2*(node)] = (len); \
527 } \
528 } \
529} STMT_END
530
531#define Set_Node_Length(node,len) \
532 Set_Node_Length_To_R((node)-RExC_emit_start, len)
533#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
534#define Set_Node_Cur_Length(node) \
535 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
536
537/* Get offsets and lengths */
538#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
539#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
540
07be1b83
YO
541#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
542 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
543 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
544} STMT_END
545
546
547#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
548#define EXPERIMENTAL_INPLACESCAN
549#endif
550
1de06328 551#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 552DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 553 PerlIO_printf(Perl_debug_log, \
b515a41d 554 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
555 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
556 (int)(depth)*2, "", \
557 (IV)((data)->pos_min), \
558 (IV)((data)->pos_delta), \
559 (IV)((data)->flags), \
560 (IV)((data)->whilem_c), \
561 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
562 ); \
563 if ((data)->last_found) \
564 PerlIO_printf(Perl_debug_log, \
565 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
566 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
567 SvPVX_const((data)->last_found), \
568 (IV)((data)->last_end), \
569 (IV)((data)->last_start_min), \
570 (IV)((data)->last_start_max), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
573 SvPVX_const((data)->longest_fixed), \
574 (IV)((data)->offset_fixed), \
575 ((data)->longest && \
576 (data)->longest==&((data)->longest_float)) ? "*" : "", \
577 SvPVX_const((data)->longest_float), \
578 (IV)((data)->offset_float_min), \
579 (IV)((data)->offset_float_max) \
580 ); \
581 PerlIO_printf(Perl_debug_log,"\n"); \
582});
583
acfe0abc 584static void clear_re(pTHX_ void *r);
4327152a 585
653099ff 586/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 587 Update the longest found anchored substring and the longest found
653099ff
GS
588 floating substrings if needed. */
589
4327152a 590STATIC void
1de06328 591S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 592{
e1ec3a88
AL
593 const STRLEN l = CHR_SVLEN(data->last_found);
594 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 595 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 596
c277df42 597 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 598 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
599 if (*data->longest == data->longest_fixed) {
600 data->offset_fixed = l ? data->last_start_min : data->pos_min;
601 if (data->flags & SF_BEFORE_EOL)
b81d288d 602 data->flags
c277df42
IZ
603 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
604 else
605 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
606 data->minlen_fixed=minlenp;
607 data->lookbehind_fixed=0;
a0ed51b3
LW
608 }
609 else {
c277df42 610 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
611 data->offset_float_max = (l
612 ? data->last_start_max
c277df42 613 : data->pos_min + data->pos_delta);
9051bda5
HS
614 if ((U32)data->offset_float_max > (U32)I32_MAX)
615 data->offset_float_max = I32_MAX;
c277df42 616 if (data->flags & SF_BEFORE_EOL)
b81d288d 617 data->flags
c277df42
IZ
618 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
619 else
620 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
621 data->minlen_float=minlenp;
622 data->lookbehind_float=0;
c277df42
IZ
623 }
624 }
625 SvCUR_set(data->last_found, 0);
0eda9292 626 {
a28509cc 627 SV * const sv = data->last_found;
097eb12c
AL
628 if (SvUTF8(sv) && SvMAGICAL(sv)) {
629 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
630 if (mg)
631 mg->mg_len = 0;
632 }
0eda9292 633 }
c277df42
IZ
634 data->last_end = -1;
635 data->flags &= ~SF_BEFORE_EOL;
1de06328 636 DEBUG_STUDYDATA(data,0);
c277df42
IZ
637}
638
653099ff
GS
639/* Can match anything (initialization) */
640STATIC void
097eb12c 641S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 642{
653099ff 643 ANYOF_CLASS_ZERO(cl);
f8bef550 644 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 645 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
646 if (LOC)
647 cl->flags |= ANYOF_LOCALE;
648}
649
650/* Can match anything (initialization) */
651STATIC int
5f66b61c 652S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
653{
654 int value;
655
aaa51d5e 656 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
657 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
658 return 1;
1aa99e6b
IH
659 if (!(cl->flags & ANYOF_UNICODE_ALL))
660 return 0;
10edeb5d 661 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 662 return 0;
653099ff
GS
663 return 1;
664}
665
666/* Can match anything (initialization) */
667STATIC void
097eb12c 668S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 669{
8ecf7187 670 Zero(cl, 1, struct regnode_charclass_class);
653099ff 671 cl->type = ANYOF;
830247a4 672 cl_anything(pRExC_state, cl);
653099ff
GS
673}
674
675STATIC void
097eb12c 676S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 677{
8ecf7187 678 Zero(cl, 1, struct regnode_charclass_class);
653099ff 679 cl->type = ANYOF;
830247a4 680 cl_anything(pRExC_state, cl);
653099ff
GS
681 if (LOC)
682 cl->flags |= ANYOF_LOCALE;
683}
684
685/* 'And' a given class with another one. Can create false positives */
686/* We assume that cl is not inverted */
687STATIC void
5f66b61c 688S_cl_and(struct regnode_charclass_class *cl,
a28509cc 689 const struct regnode_charclass_class *and_with)
653099ff 690{
40d049e4
YO
691
692 assert(and_with->type == ANYOF);
653099ff
GS
693 if (!(and_with->flags & ANYOF_CLASS)
694 && !(cl->flags & ANYOF_CLASS)
695 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
696 && !(and_with->flags & ANYOF_FOLD)
697 && !(cl->flags & ANYOF_FOLD)) {
698 int i;
699
700 if (and_with->flags & ANYOF_INVERT)
701 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
702 cl->bitmap[i] &= ~and_with->bitmap[i];
703 else
704 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
705 cl->bitmap[i] &= and_with->bitmap[i];
706 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
707 if (!(and_with->flags & ANYOF_EOS))
708 cl->flags &= ~ANYOF_EOS;
1aa99e6b 709
14ebb1a2
JH
710 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
711 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
712 cl->flags &= ~ANYOF_UNICODE_ALL;
713 cl->flags |= ANYOF_UNICODE;
714 ARG_SET(cl, ARG(and_with));
715 }
14ebb1a2
JH
716 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
717 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 718 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
719 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
720 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 721 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
722}
723
724/* 'OR' a given class with another one. Can create false positives */
725/* We assume that cl is not inverted */
726STATIC void
097eb12c 727S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 728{
653099ff
GS
729 if (or_with->flags & ANYOF_INVERT) {
730 /* We do not use
731 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
732 * <= (B1 | !B2) | (CL1 | !CL2)
733 * which is wasteful if CL2 is small, but we ignore CL2:
734 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
735 * XXXX Can we handle case-fold? Unclear:
736 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
737 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
738 */
739 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
740 && !(or_with->flags & ANYOF_FOLD)
741 && !(cl->flags & ANYOF_FOLD) ) {
742 int i;
743
744 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
745 cl->bitmap[i] |= ~or_with->bitmap[i];
746 } /* XXXX: logic is complicated otherwise */
747 else {
830247a4 748 cl_anything(pRExC_state, cl);
653099ff
GS
749 }
750 } else {
751 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 753 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
754 || (cl->flags & ANYOF_FOLD)) ) {
755 int i;
756
757 /* OR char bitmap and class bitmap separately */
758 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
759 cl->bitmap[i] |= or_with->bitmap[i];
760 if (or_with->flags & ANYOF_CLASS) {
761 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
762 cl->classflags[i] |= or_with->classflags[i];
763 cl->flags |= ANYOF_CLASS;
764 }
765 }
766 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 767 cl_anything(pRExC_state, cl);
653099ff
GS
768 }
769 }
770 if (or_with->flags & ANYOF_EOS)
771 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
772
773 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
774 ARG(cl) != ARG(or_with)) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
778 if (or_with->flags & ANYOF_UNICODE_ALL) {
779 cl->flags |= ANYOF_UNICODE_ALL;
780 cl->flags &= ~ANYOF_UNICODE;
781 }
653099ff
GS
782}
783
a3621e74
YO
784#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
785#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
786#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
787#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
788
3dab1dad
YO
789
790#ifdef DEBUGGING
07be1b83 791/*
3dab1dad
YO
792 dump_trie(trie)
793 dump_trie_interim_list(trie,next_alloc)
794 dump_trie_interim_table(trie,next_alloc)
795
796 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
797 The _interim_ variants are used for debugging the interim
798 tables that are used to generate the final compressed
799 representation which is what dump_trie expects.
800
3dab1dad
YO
801 Part of the reason for their existance is to provide a form
802 of documentation as to how the different representations function.
07be1b83
YO
803
804*/
3dab1dad
YO
805
806/*
807 dump_trie(trie)
808 Dumps the final compressed table form of the trie to Perl_debug_log.
809 Used for debugging make_trie().
810*/
811
812STATIC void
813S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
814{
815 U32 state;
ab3bbdeb
YO
816 SV *sv=sv_newmortal();
817 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
818 GET_RE_DEBUG_FLAGS_DECL;
819
ab3bbdeb 820
3dab1dad
YO
821 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
822 (int)depth * 2 + 2,"",
823 "Match","Base","Ofs" );
824
825 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 826 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 827 if ( tmp ) {
ab3bbdeb
YO
828 PerlIO_printf( Perl_debug_log, "%*s",
829 colwidth,
ddc5bc0f 830 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
831 PL_colors[0], PL_colors[1],
832 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
833 PERL_PV_ESCAPE_FIRSTCHAR
834 )
835 );
3dab1dad
YO
836 }
837 }
838 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
839 (int)depth * 2 + 2,"");
840
841 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 842 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
843 PerlIO_printf( Perl_debug_log, "\n");
844
1e2e3d02 845 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 846 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
847
848 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
849
850 if ( trie->states[ state ].wordnum ) {
851 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
852 } else {
853 PerlIO_printf( Perl_debug_log, "%6s", "" );
854 }
855
856 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
857
858 if ( base ) {
859 U32 ofs = 0;
860
861 while( ( base + ofs < trie->uniquecharcount ) ||
862 ( base + ofs - trie->uniquecharcount < trie->lasttrans
863 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
864 ofs++;
865
866 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
867
868 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
869 if ( ( base + ofs >= trie->uniquecharcount ) &&
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
871 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
872 {
ab3bbdeb
YO
873 PerlIO_printf( Perl_debug_log, "%*"UVXf,
874 colwidth,
3dab1dad
YO
875 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
876 } else {
ab3bbdeb 877 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
878 }
879 }
880
881 PerlIO_printf( Perl_debug_log, "]");
882
883 }
884 PerlIO_printf( Perl_debug_log, "\n" );
885 }
886}
887/*
888 dump_trie_interim_list(trie,next_alloc)
889 Dumps a fully constructed but uncompressed trie in list form.
890 List tries normally only are used for construction when the number of
891 possible chars (trie->uniquecharcount) is very high.
892 Used for debugging make_trie().
893*/
894STATIC void
895S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
896{
897 U32 state;
ab3bbdeb
YO
898 SV *sv=sv_newmortal();
899 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
900 GET_RE_DEBUG_FLAGS_DECL;
901 /* print out the table precompression. */
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
903 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
904 "------:-----+-----------------\n" );
3dab1dad
YO
905
906 for( state=1 ; state < next_alloc ; state ++ ) {
907 U16 charid;
908
ab3bbdeb 909 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
910 (int)depth * 2 + 2,"", (UV)state );
911 if ( ! trie->states[ state ].wordnum ) {
912 PerlIO_printf( Perl_debug_log, "%5s| ","");
913 } else {
914 PerlIO_printf( Perl_debug_log, "W%4x| ",
915 trie->states[ state ].wordnum
916 );
917 }
918 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 919 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
920 if ( tmp ) {
921 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
922 colwidth,
ddc5bc0f 923 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
924 PL_colors[0], PL_colors[1],
925 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
926 PERL_PV_ESCAPE_FIRSTCHAR
927 ) ,
1e2e3d02
YO
928 TRIE_LIST_ITEM(state,charid).forid,
929 (UV)TRIE_LIST_ITEM(state,charid).newstate
930 );
931 if (!(charid % 10))
664e119d
RGS
932 PerlIO_printf(Perl_debug_log, "\n%*s| ",
933 (int)((depth * 2) + 14), "");
1e2e3d02 934 }
ab3bbdeb
YO
935 }
936 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
937 }
938}
939
940/*
941 dump_trie_interim_table(trie,next_alloc)
942 Dumps a fully constructed but uncompressed trie in table form.
943 This is the normal DFA style state transition table, with a few
944 twists to facilitate compression later.
945 Used for debugging make_trie().
946*/
947STATIC void
948S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
949{
950 U32 state;
951 U16 charid;
ab3bbdeb
YO
952 SV *sv=sv_newmortal();
953 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
954 GET_RE_DEBUG_FLAGS_DECL;
955
956 /*
957 print out the table precompression so that we can do a visual check
958 that they are identical.
959 */
960
961 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
962
963 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 964 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 965 if ( tmp ) {
ab3bbdeb
YO
966 PerlIO_printf( Perl_debug_log, "%*s",
967 colwidth,
ddc5bc0f 968 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
969 PL_colors[0], PL_colors[1],
970 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
971 PERL_PV_ESCAPE_FIRSTCHAR
972 )
973 );
3dab1dad
YO
974 }
975 }
976
977 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
978
979 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 980 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
981 }
982
983 PerlIO_printf( Perl_debug_log, "\n" );
984
985 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
986
987 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
988 (int)depth * 2 + 2,"",
989 (UV)TRIE_NODENUM( state ) );
990
991 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
992 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
993 if (v)
994 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
995 else
996 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
997 }
998 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1000 } else {
1001 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1002 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1003 }
1004 }
07be1b83 1005}
3dab1dad
YO
1006
1007#endif
1008
786e8c11
YO
1009/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1010 startbranch: the first branch in the whole branch sequence
1011 first : start branch of sequence of branch-exact nodes.
1012 May be the same as startbranch
1013 last : Thing following the last branch.
1014 May be the same as tail.
1015 tail : item following the branch sequence
1016 count : words in the sequence
1017 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1018 depth : indent depth
3dab1dad 1019
786e8c11 1020Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1021
786e8c11
YO
1022A trie is an N'ary tree where the branches are determined by digital
1023decomposition of the key. IE, at the root node you look up the 1st character and
1024follow that branch repeat until you find the end of the branches. Nodes can be
1025marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1026
786e8c11 1027 /he|she|his|hers/
72f13be8 1028
786e8c11
YO
1029would convert into the following structure. Numbers represent states, letters
1030following numbers represent valid transitions on the letter from that state, if
1031the number is in square brackets it represents an accepting state, otherwise it
1032will be in parenthesis.
07be1b83 1033
786e8c11
YO
1034 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1035 | |
1036 | (2)
1037 | |
1038 (1) +-i->(6)-+-s->[7]
1039 |
1040 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1041
786e8c11
YO
1042 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1043
1044This shows that when matching against the string 'hers' we will begin at state 1
1045read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1046then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1047is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1048single traverse. We store a mapping from accepting to state to which word was
1049matched, and then when we have multiple possibilities we try to complete the
1050rest of the regex in the order in which they occured in the alternation.
1051
1052The only prior NFA like behaviour that would be changed by the TRIE support is
1053the silent ignoring of duplicate alternations which are of the form:
1054
1055 / (DUPE|DUPE) X? (?{ ... }) Y /x
1056
1057Thus EVAL blocks follwing a trie may be called a different number of times with
1058and without the optimisation. With the optimisations dupes will be silently
1059ignored. This inconsistant behaviour of EVAL type nodes is well established as
1060the following demonstrates:
1061
1062 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1063
1064which prints out 'word' three times, but
1065
1066 'words'=~/(word|word|word)(?{ print $1 })S/
1067
1068which doesnt print it out at all. This is due to other optimisations kicking in.
1069
1070Example of what happens on a structural level:
1071
1072The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1073
1074 1: CURLYM[1] {1,32767}(18)
1075 5: BRANCH(8)
1076 6: EXACT <ac>(16)
1077 8: BRANCH(11)
1078 9: EXACT <ad>(16)
1079 11: BRANCH(14)
1080 12: EXACT <ab>(16)
1081 16: SUCCEED(0)
1082 17: NOTHING(18)
1083 18: END(0)
1084
1085This would be optimizable with startbranch=5, first=5, last=16, tail=16
1086and should turn into:
1087
1088 1: CURLYM[1] {1,32767}(18)
1089 5: TRIE(16)
1090 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1091 <ac>
1092 <ad>
1093 <ab>
1094 16: SUCCEED(0)
1095 17: NOTHING(18)
1096 18: END(0)
1097
1098Cases where tail != last would be like /(?foo|bar)baz/:
1099
1100 1: BRANCH(4)
1101 2: EXACT <foo>(8)
1102 4: BRANCH(7)
1103 5: EXACT <bar>(8)
1104 7: TAIL(8)
1105 8: EXACT <baz>(10)
1106 10: END(0)
1107
1108which would be optimizable with startbranch=1, first=1, last=7, tail=8
1109and would end up looking like:
1110
1111 1: TRIE(8)
1112 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1113 <foo>
1114 <bar>
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119 d = uvuni_to_utf8_flags(d, uv, 0);
1120
1121is the recommended Unicode-aware way of saying
1122
1123 *(d++) = uv;
1124*/
1125
1e2e3d02 1126#define TRIE_STORE_REVCHAR \
786e8c11 1127 STMT_START { \
1e2e3d02 1128 SV *tmp = newSVpvs(""); \
786e8c11 1129 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1130 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1131 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1132 } STMT_END
1133
1134#define TRIE_READ_CHAR STMT_START { \
1135 wordlen++; \
1136 if ( UTF ) { \
1137 if ( folder ) { \
1138 if ( foldlen > 0 ) { \
1139 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1140 foldlen -= len; \
1141 scan += len; \
1142 len = 0; \
1143 } else { \
1144 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1145 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1146 foldlen -= UNISKIP( uvc ); \
1147 scan = foldbuf + UNISKIP( uvc ); \
1148 } \
1149 } else { \
1150 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1151 } \
1152 } else { \
1153 uvc = (U32)*uc; \
1154 len = 1; \
1155 } \
1156} STMT_END
1157
1158
1159
1160#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1161 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1162 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1163 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1164 } \
1165 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1166 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1167 TRIE_LIST_CUR( state )++; \
1168} STMT_END
07be1b83 1169
786e8c11
YO
1170#define TRIE_LIST_NEW(state) STMT_START { \
1171 Newxz( trie->states[ state ].trans.list, \
1172 4, reg_trie_trans_le ); \
1173 TRIE_LIST_CUR( state ) = 1; \
1174 TRIE_LIST_LEN( state ) = 4; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_HANDLE_WORD(state) STMT_START { \
1178 U16 dupe= trie->states[ state ].wordnum; \
1179 regnode * const noper_next = regnext( noper ); \
1180 \
1181 if (trie->wordlen) \
1182 trie->wordlen[ curword ] = wordlen; \
1183 DEBUG_r({ \
1184 /* store the word for dumping */ \
1185 SV* tmp; \
1186 if (OP(noper) != NOTHING) \
1187 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1188 else \
1189 tmp = newSVpvn( "", 0 ); \
1190 if ( UTF ) SvUTF8_on( tmp ); \
1191 av_push( trie->words, tmp ); \
1192 }); \
1193 \
1194 curword++; \
1195 \
1196 if ( noper_next < tail ) { \
1197 if (!trie->jump) \
1198 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1199 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1200 if (!jumper) \
1201 jumper = noper_next; \
1202 if (!nextbranch) \
1203 nextbranch= regnext(cur); \
1204 } \
1205 \
1206 if ( dupe ) { \
1207 /* So it's a dupe. This means we need to maintain a */\
1208 /* linked-list from the first to the next. */\
1209 /* we only allocate the nextword buffer when there */\
1210 /* a dupe, so first time we have to do the allocation */\
1211 if (!trie->nextword) \
1212 Newxz( trie->nextword, word_count + 1, U16); \
1213 while ( trie->nextword[dupe] ) \
1214 dupe= trie->nextword[dupe]; \
1215 trie->nextword[dupe]= curword; \
1216 } else { \
1217 /* we haven't inserted this word yet. */ \
1218 trie->states[ state ].wordnum = curword; \
1219 } \
1220} STMT_END
07be1b83 1221
3dab1dad 1222
786e8c11
YO
1223#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1224 ( ( base + charid >= ucharcount \
1225 && base + charid < ubound \
1226 && state == trie->trans[ base - ucharcount + charid ].check \
1227 && trie->trans[ base - ucharcount + charid ].next ) \
1228 ? trie->trans[ base - ucharcount + charid ].next \
1229 : ( state==1 ? special : 0 ) \
1230 )
3dab1dad 1231
786e8c11
YO
1232#define MADE_TRIE 1
1233#define MADE_JUMP_TRIE 2
1234#define MADE_EXACT_TRIE 4
3dab1dad 1235
a3621e74 1236STATIC I32
786e8c11 1237S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1238{
27da23d5 1239 dVAR;
a3621e74
YO
1240 /* first pass, loop through and scan words */
1241 reg_trie_data *trie;
1242 regnode *cur;
9f7f3913 1243 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1244 STRLEN len = 0;
1245 UV uvc = 0;
1246 U16 curword = 0;
1247 U32 next_alloc = 0;
786e8c11
YO
1248 regnode *jumper = NULL;
1249 regnode *nextbranch = NULL;
7f69552c 1250 regnode *convert = NULL;
a3621e74 1251 /* we just use folder as a flag in utf8 */
e1ec3a88 1252 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1253 ? PL_fold
1254 : ( flags == EXACTFL
1255 ? PL_fold_locale
1256 : NULL
1257 )
1258 );
1259
e1ec3a88 1260 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1261 SV *re_trie_maxbuff;
3dab1dad
YO
1262#ifndef DEBUGGING
1263 /* these are only used during construction but are useful during
8e11feef 1264 * debugging so we store them in the struct when debugging.
8e11feef 1265 */
3dab1dad 1266 STRLEN trie_charcount=0;
3dab1dad
YO
1267 AV *trie_revcharmap;
1268#endif
a3621e74 1269 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1270#ifndef DEBUGGING
1271 PERL_UNUSED_ARG(depth);
1272#endif
a3621e74 1273
a02a5408 1274 Newxz( trie, 1, reg_trie_data );
a3621e74 1275 trie->refcount = 1;
3dab1dad 1276 trie->startstate = 1;
786e8c11 1277 trie->wordcount = word_count;
a3621e74 1278 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1279 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1280 if (!(UTF && folder))
1281 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1282 DEBUG_r({
1283 trie->words = newAV();
a3621e74 1284 });
3dab1dad 1285 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1286
0111c4fd 1287 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1288 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1289 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1290 }
3dab1dad
YO
1291 DEBUG_OPTIMISE_r({
1292 PerlIO_printf( Perl_debug_log,
786e8c11 1293 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1294 (int)depth * 2 + 2, "",
1295 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1296 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1297 (int)depth);
3dab1dad 1298 });
7f69552c
YO
1299
1300 /* Find the node we are going to overwrite */
1301 if ( first == startbranch && OP( last ) != BRANCH ) {
1302 /* whole branch chain */
1303 convert = first;
1304 } else {
1305 /* branch sub-chain */
1306 convert = NEXTOPER( first );
1307 }
1308
a3621e74
YO
1309 /* -- First loop and Setup --
1310
1311 We first traverse the branches and scan each word to determine if it
1312 contains widechars, and how many unique chars there are, this is
1313 important as we have to build a table with at least as many columns as we
1314 have unique chars.
1315
1316 We use an array of integers to represent the character codes 0..255
1317 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1318 native representation of the character value as the key and IV's for the
1319 coded index.
1320
1321 *TODO* If we keep track of how many times each character is used we can
1322 remap the columns so that the table compression later on is more
1323 efficient in terms of memory by ensuring most common value is in the
1324 middle and the least common are on the outside. IMO this would be better
1325 than a most to least common mapping as theres a decent chance the most
1326 common letter will share a node with the least common, meaning the node
1327 will not be compressable. With a middle is most common approach the worst
1328 case is when we have the least common nodes twice.
1329
1330 */
1331
a3621e74 1332 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1333 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1334 const U8 *uc = (U8*)STRING( noper );
a28509cc 1335 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1336 STRLEN foldlen = 0;
1337 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1338 const U8 *scan = (U8*)NULL;
07be1b83 1339 U32 wordlen = 0; /* required init */
3dab1dad 1340 STRLEN chars=0;
a3621e74 1341
3dab1dad
YO
1342 if (OP(noper) == NOTHING) {
1343 trie->minlen= 0;
1344 continue;
1345 }
1346 if (trie->bitmap) {
1347 TRIE_BITMAP_SET(trie,*uc);
1348 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1349 }
a3621e74 1350 for ( ; uc < e ; uc += len ) {
3dab1dad 1351 TRIE_CHARCOUNT(trie)++;
a3621e74 1352 TRIE_READ_CHAR;
3dab1dad 1353 chars++;
a3621e74
YO
1354 if ( uvc < 256 ) {
1355 if ( !trie->charmap[ uvc ] ) {
1356 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1357 if ( folder )
1358 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1359 TRIE_STORE_REVCHAR;
a3621e74
YO
1360 }
1361 } else {
1362 SV** svpp;
1363 if ( !trie->widecharmap )
1364 trie->widecharmap = newHV();
1365
1366 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1367
1368 if ( !svpp )
e4584336 1369 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1370
1371 if ( !SvTRUE( *svpp ) ) {
1372 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1373 TRIE_STORE_REVCHAR;
a3621e74
YO
1374 }
1375 }
1376 }
3dab1dad
YO
1377 if( cur == first ) {
1378 trie->minlen=chars;
1379 trie->maxlen=chars;
1380 } else if (chars < trie->minlen) {
1381 trie->minlen=chars;
1382 } else if (chars > trie->maxlen) {
1383 trie->maxlen=chars;
1384 }
1385
a3621e74
YO
1386 } /* end first pass */
1387 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1388 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1389 (int)depth * 2 + 2,"",
85c3142d 1390 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1391 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1392 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1393 );
786e8c11 1394 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1395
1396 /*
1397 We now know what we are dealing with in terms of unique chars and
1398 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1399 representation using a flat table will take. If it's over a reasonable
1400 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1401 conservative but potentially much slower representation using an array
1402 of lists.
1403
1404 At the end we convert both representations into the same compressed
1405 form that will be used in regexec.c for matching with. The latter
1406 is a form that cannot be used to construct with but has memory
1407 properties similar to the list form and access properties similar
1408 to the table form making it both suitable for fast searches and
1409 small enough that its feasable to store for the duration of a program.
1410
1411 See the comment in the code where the compressed table is produced
1412 inplace from the flat tabe representation for an explanation of how
1413 the compression works.
1414
1415 */
1416
1417
3dab1dad 1418 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1419 /*
1420 Second Pass -- Array Of Lists Representation
1421
1422 Each state will be represented by a list of charid:state records
1423 (reg_trie_trans_le) the first such element holds the CUR and LEN
1424 points of the allocated array. (See defines above).
1425
1426 We build the initial structure using the lists, and then convert
1427 it into the compressed table form which allows faster lookups
1428 (but cant be modified once converted).
a3621e74
YO
1429 */
1430
a3621e74
YO
1431 STRLEN transcount = 1;
1432
1e2e3d02
YO
1433 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1434 "%*sCompiling trie using list compiler\n",
1435 (int)depth * 2 + 2, ""));
1436
3dab1dad 1437 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1438 TRIE_LIST_NEW(1);
1439 next_alloc = 2;
1440
1441 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1442
c445ea15
AL
1443 regnode * const noper = NEXTOPER( cur );
1444 U8 *uc = (U8*)STRING( noper );
1445 const U8 * const e = uc + STR_LEN( noper );
1446 U32 state = 1; /* required init */
1447 U16 charid = 0; /* sanity init */
1448 U8 *scan = (U8*)NULL; /* sanity init */
1449 STRLEN foldlen = 0; /* required init */
07be1b83 1450 U32 wordlen = 0; /* required init */
c445ea15
AL
1451 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1452
3dab1dad 1453 if (OP(noper) != NOTHING) {
786e8c11 1454 for ( ; uc < e ; uc += len ) {
c445ea15 1455
786e8c11 1456 TRIE_READ_CHAR;
c445ea15 1457
786e8c11
YO
1458 if ( uvc < 256 ) {
1459 charid = trie->charmap[ uvc ];
c445ea15 1460 } else {
786e8c11
YO
1461 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1462 if ( !svpp ) {
1463 charid = 0;
1464 } else {
1465 charid=(U16)SvIV( *svpp );
1466 }
c445ea15 1467 }
786e8c11
YO
1468 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1469 if ( charid ) {
a3621e74 1470
786e8c11
YO
1471 U16 check;
1472 U32 newstate = 0;
a3621e74 1473
786e8c11
YO
1474 charid--;
1475 if ( !trie->states[ state ].trans.list ) {
1476 TRIE_LIST_NEW( state );
c445ea15 1477 }
786e8c11
YO
1478 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1479 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1480 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1481 break;
1482 }
1483 }
1484 if ( ! newstate ) {
1485 newstate = next_alloc++;
1486 TRIE_LIST_PUSH( state, charid, newstate );
1487 transcount++;
1488 }
1489 state = newstate;
1490 } else {
1491 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1492 }
a28509cc 1493 }
c445ea15 1494 }
3dab1dad 1495 TRIE_HANDLE_WORD(state);
a3621e74
YO
1496
1497 } /* end second pass */
1498
1e2e3d02
YO
1499 /* next alloc is the NEXT state to be allocated */
1500 trie->statecount = next_alloc;
a3621e74
YO
1501 Renew( trie->states, next_alloc, reg_trie_state );
1502
3dab1dad
YO
1503 /* and now dump it out before we compress it */
1504 DEBUG_TRIE_COMPILE_MORE_r(
1505 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1506 );
a3621e74 1507
a02a5408 1508 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1509 {
1510 U32 state;
a3621e74
YO
1511 U32 tp = 0;
1512 U32 zp = 0;
1513
1514
1515 for( state=1 ; state < next_alloc ; state ++ ) {
1516 U32 base=0;
1517
1518 /*
1519 DEBUG_TRIE_COMPILE_MORE_r(
1520 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1521 );
1522 */
1523
1524 if (trie->states[state].trans.list) {
1525 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1526 U16 maxid=minid;
a28509cc 1527 U16 idx;
a3621e74
YO
1528
1529 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1530 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1531 if ( forid < minid ) {
1532 minid=forid;
1533 } else if ( forid > maxid ) {
1534 maxid=forid;
1535 }
a3621e74
YO
1536 }
1537 if ( transcount < tp + maxid - minid + 1) {
1538 transcount *= 2;
1539 Renew( trie->trans, transcount, reg_trie_trans );
1540 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1541 }
1542 base = trie->uniquecharcount + tp - minid;
1543 if ( maxid == minid ) {
1544 U32 set = 0;
1545 for ( ; zp < tp ; zp++ ) {
1546 if ( ! trie->trans[ zp ].next ) {
1547 base = trie->uniquecharcount + zp - minid;
1548 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1549 trie->trans[ zp ].check = state;
1550 set = 1;
1551 break;
1552 }
1553 }
1554 if ( !set ) {
1555 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ tp ].check = state;
1557 tp++;
1558 zp = tp;
1559 }
1560 } else {
1561 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1562 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1563 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1564 trie->trans[ tid ].check = state;
1565 }
1566 tp += ( maxid - minid + 1 );
1567 }
1568 Safefree(trie->states[ state ].trans.list);
1569 }
1570 /*
1571 DEBUG_TRIE_COMPILE_MORE_r(
1572 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1573 );
1574 */
1575 trie->states[ state ].trans.base=base;
1576 }
cc601c31 1577 trie->lasttrans = tp + 1;
a3621e74
YO
1578 }
1579 } else {
1580 /*
1581 Second Pass -- Flat Table Representation.
1582
1583 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1584 We know that we will need Charcount+1 trans at most to store the data
1585 (one row per char at worst case) So we preallocate both structures
1586 assuming worst case.
1587
1588 We then construct the trie using only the .next slots of the entry
1589 structs.
1590
1591 We use the .check field of the first entry of the node temporarily to
1592 make compression both faster and easier by keeping track of how many non
1593 zero fields are in the node.
1594
1595 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1596 transition.
1597
1598 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1599 number representing the first entry of the node, and state as a
1600 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1601 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1602 are 2 entrys per node. eg:
1603
1604 A B A B
1605 1. 2 4 1. 3 7
1606 2. 0 3 3. 0 5
1607 3. 0 0 5. 0 0
1608 4. 0 0 7. 0 0
1609
1610 The table is internally in the right hand, idx form. However as we also
1611 have to deal with the states array which is indexed by nodenum we have to
1612 use TRIE_NODENUM() to convert.
1613
1614 */
1e2e3d02
YO
1615 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1616 "%*sCompiling trie using table compiler\n",
1617 (int)depth * 2 + 2, ""));
3dab1dad
YO
1618
1619 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1620 reg_trie_trans );
3dab1dad 1621 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1622 next_alloc = trie->uniquecharcount + 1;
1623
3dab1dad 1624
a3621e74
YO
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1626
c445ea15 1627 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1628 const U8 *uc = (U8*)STRING( noper );
1629 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1630
1631 U32 state = 1; /* required init */
1632
1633 U16 charid = 0; /* sanity init */
1634 U32 accept_state = 0; /* sanity init */
1635 U8 *scan = (U8*)NULL; /* sanity init */
1636
1637 STRLEN foldlen = 0; /* required init */
07be1b83 1638 U32 wordlen = 0; /* required init */
a3621e74
YO
1639 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1640
3dab1dad 1641 if ( OP(noper) != NOTHING ) {
786e8c11 1642 for ( ; uc < e ; uc += len ) {
a3621e74 1643
786e8c11 1644 TRIE_READ_CHAR;
a3621e74 1645
786e8c11
YO
1646 if ( uvc < 256 ) {
1647 charid = trie->charmap[ uvc ];
1648 } else {
1649 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1650 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1651 }
786e8c11
YO
1652 if ( charid ) {
1653 charid--;
1654 if ( !trie->trans[ state + charid ].next ) {
1655 trie->trans[ state + charid ].next = next_alloc;
1656 trie->trans[ state ].check++;
1657 next_alloc += trie->uniquecharcount;
1658 }
1659 state = trie->trans[ state + charid ].next;
1660 } else {
1661 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1662 }
1663 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1664 }
a3621e74 1665 }
3dab1dad
YO
1666 accept_state = TRIE_NODENUM( state );
1667 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1668
1669 } /* end second pass */
1670
3dab1dad
YO
1671 /* and now dump it out before we compress it */
1672 DEBUG_TRIE_COMPILE_MORE_r(
1673 dump_trie_interim_table(trie,next_alloc,depth+1)
1674 );
a3621e74 1675
a3621e74
YO
1676 {
1677 /*
1678 * Inplace compress the table.*
1679
1680 For sparse data sets the table constructed by the trie algorithm will
1681 be mostly 0/FAIL transitions or to put it another way mostly empty.
1682 (Note that leaf nodes will not contain any transitions.)
1683
1684 This algorithm compresses the tables by eliminating most such
1685 transitions, at the cost of a modest bit of extra work during lookup:
1686
1687 - Each states[] entry contains a .base field which indicates the
1688 index in the state[] array wheres its transition data is stored.
1689
1690 - If .base is 0 there are no valid transitions from that node.
1691
1692 - If .base is nonzero then charid is added to it to find an entry in
1693 the trans array.
1694
1695 -If trans[states[state].base+charid].check!=state then the
1696 transition is taken to be a 0/Fail transition. Thus if there are fail
1697 transitions at the front of the node then the .base offset will point
1698 somewhere inside the previous nodes data (or maybe even into a node
1699 even earlier), but the .check field determines if the transition is
1700 valid.
1701
786e8c11 1702 XXX - wrong maybe?
a3621e74
YO
1703 The following process inplace converts the table to the compressed
1704 table: We first do not compress the root node 1,and mark its all its
1705 .check pointers as 1 and set its .base pointer as 1 as well. This
1706 allows to do a DFA construction from the compressed table later, and
1707 ensures that any .base pointers we calculate later are greater than
1708 0.
1709
1710 - We set 'pos' to indicate the first entry of the second node.
1711
1712 - We then iterate over the columns of the node, finding the first and
1713 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1714 and set the .check pointers accordingly, and advance pos
1715 appropriately and repreat for the next node. Note that when we copy
1716 the next pointers we have to convert them from the original
1717 NODEIDX form to NODENUM form as the former is not valid post
1718 compression.
1719
1720 - If a node has no transitions used we mark its base as 0 and do not
1721 advance the pos pointer.
1722
1723 - If a node only has one transition we use a second pointer into the
1724 structure to fill in allocated fail transitions from other states.
1725 This pointer is independent of the main pointer and scans forward
1726 looking for null transitions that are allocated to a state. When it
1727 finds one it writes the single transition into the "hole". If the
786e8c11 1728 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1729
1730 - Once compressed we can Renew/realloc the structures to release the
1731 excess space.
1732
1733 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1734 specifically Fig 3.47 and the associated pseudocode.
1735
1736 demq
1737 */
a3b680e6 1738 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1739 U32 state, charid;
a3621e74 1740 U32 pos = 0, zp=0;
1e2e3d02 1741 trie->statecount = laststate;
a3621e74
YO
1742
1743 for ( state = 1 ; state < laststate ; state++ ) {
1744 U8 flag = 0;
a28509cc
AL
1745 const U32 stateidx = TRIE_NODEIDX( state );
1746 const U32 o_used = trie->trans[ stateidx ].check;
1747 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1748 trie->trans[ stateidx ].check = 0;
1749
1750 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1751 if ( flag || trie->trans[ stateidx + charid ].next ) {
1752 if ( trie->trans[ stateidx + charid ].next ) {
1753 if (o_used == 1) {
1754 for ( ; zp < pos ; zp++ ) {
1755 if ( ! trie->trans[ zp ].next ) {
1756 break;
1757 }
1758 }
1759 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1760 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1761 trie->trans[ zp ].check = state;
1762 if ( ++zp > pos ) pos = zp;
1763 break;
1764 }
1765 used--;
1766 }
1767 if ( !flag ) {
1768 flag = 1;
1769 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1770 }
1771 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1772 trie->trans[ pos ].check = state;
1773 pos++;
1774 }
1775 }
1776 }
cc601c31 1777 trie->lasttrans = pos + 1;
1e2e3d02 1778 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1779 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1780 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1781 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1782 (int)depth * 2 + 2,"",
1783 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1784 (IV)next_alloc,
1785 (IV)pos,
a3621e74
YO
1786 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1787 );
1788
1789 } /* end table compress */
1790 }
1e2e3d02
YO
1791 DEBUG_TRIE_COMPILE_MORE_r(
1792 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1793 (int)depth * 2 + 2, "",
1794 (UV)trie->statecount,
1795 (UV)trie->lasttrans)
1796 );
cc601c31
YO
1797 /* resize the trans array to remove unused space */
1798 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1799
3dab1dad
YO
1800 /* and now dump out the compressed format */
1801 DEBUG_TRIE_COMPILE_r(
1802 dump_trie(trie,depth+1)
1803 );
07be1b83 1804
3dab1dad 1805 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1806 U8 nodetype =(U8)(flags & 0xFF);
1807 char *str=NULL;
786e8c11 1808
07be1b83 1809#ifdef DEBUGGING
e62cc96a 1810 regnode *optimize = NULL;
b57a0404
JH
1811 U32 mjd_offset = 0;
1812 U32 mjd_nodelen = 0;
07be1b83 1813#endif
a3621e74 1814 /*
3dab1dad
YO
1815 This means we convert either the first branch or the first Exact,
1816 depending on whether the thing following (in 'last') is a branch
1817 or not and whther first is the startbranch (ie is it a sub part of
1818 the alternation or is it the whole thing.)
1819 Assuming its a sub part we conver the EXACT otherwise we convert
1820 the whole branch sequence, including the first.
a3621e74 1821 */
3dab1dad 1822 /* Find the node we are going to overwrite */
7f69552c 1823 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1824 /* branch sub-chain */
3dab1dad 1825 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1826 DEBUG_r({
1827 mjd_offset= Node_Offset((convert));
1828 mjd_nodelen= Node_Length((convert));
1829 });
7f69552c
YO
1830 /* whole branch chain */
1831 } else {
1832 DEBUG_r({
1833 const regnode *nop = NEXTOPER( convert );
1834 mjd_offset= Node_Offset((nop));
1835 mjd_nodelen= Node_Length((nop));
1836 });
07be1b83 1837 }
7f69552c 1838
07be1b83
YO
1839 DEBUG_OPTIMISE_r(
1840 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1841 (int)depth * 2 + 2, "",
786e8c11 1842 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1843 );
a3621e74 1844
3dab1dad
YO
1845 /* But first we check to see if there is a common prefix we can
1846 split out as an EXACT and put in front of the TRIE node. */
1847 trie->startstate= 1;
786e8c11 1848 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1849 U32 state;
1e2e3d02 1850 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1851 U32 ofs = 0;
8e11feef
RGS
1852 I32 idx = -1;
1853 U32 count = 0;
1854 const U32 base = trie->states[ state ].trans.base;
a3621e74 1855
3dab1dad 1856 if ( trie->states[state].wordnum )
8e11feef 1857 count = 1;
a3621e74 1858
8e11feef 1859 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1860 if ( ( base + ofs >= trie->uniquecharcount ) &&
1861 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1862 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1863 {
3dab1dad 1864 if ( ++count > 1 ) {
8e11feef 1865 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1866 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1867 if ( state == 1 ) break;
3dab1dad
YO
1868 if ( count == 2 ) {
1869 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1870 DEBUG_OPTIMISE_r(
8e11feef
RGS
1871 PerlIO_printf(Perl_debug_log,
1872 "%*sNew Start State=%"UVuf" Class: [",
1873 (int)depth * 2 + 2, "",
786e8c11 1874 (UV)state));
be8e71aa
YO
1875 if (idx >= 0) {
1876 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1877 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1878
3dab1dad 1879 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1880 if ( folder )
1881 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1882 DEBUG_OPTIMISE_r(
07be1b83 1883 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1884 );
8e11feef
RGS
1885 }
1886 }
1887 TRIE_BITMAP_SET(trie,*ch);
1888 if ( folder )
1889 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1890 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1891 }
1892 idx = ofs;
1893 }
3dab1dad
YO
1894 }
1895 if ( count == 1 ) {
1896 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
de734bd5
A
1897 char *ch = SvPV_nolen( *tmp );
1898 DEBUG_OPTIMISE_r({
1899 SV *sv=sv_newmortal();
8e11feef
RGS
1900 PerlIO_printf( Perl_debug_log,
1901 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1902 (int)depth * 2 + 2, "",
de734bd5
A
1903 (UV)state, (UV)idx,
1904 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1905 PL_colors[0], PL_colors[1],
1906 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1907 PERL_PV_ESCAPE_FIRSTCHAR
1908 )
1909 );
1910 });
3dab1dad
YO
1911 if ( state==1 ) {
1912 OP( convert ) = nodetype;
1913 str=STRING(convert);
1914 STR_LEN(convert)=0;
1915 }
de734bd5
A
1916 while (*ch) {
1917 *str++ = *ch++;
1918 STR_LEN(convert)++;
1919 }
1920
8e11feef 1921 } else {
f9049ba1 1922#ifdef DEBUGGING
8e11feef
RGS
1923 if (state>1)
1924 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1925#endif
8e11feef
RGS
1926 break;
1927 }
1928 }
3dab1dad 1929 if (str) {
8e11feef 1930 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1931 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1932 trie->startstate = state;
07be1b83
YO
1933 trie->minlen -= (state - 1);
1934 trie->maxlen -= (state - 1);
1935 DEBUG_r({
1936 regnode *fix = convert;
de734bd5 1937 U32 word = trie->wordcount;
07be1b83
YO
1938 mjd_nodelen++;
1939 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1940 while( ++fix < n ) {
1941 Set_Node_Offset_Length(fix, 0, 0);
1942 }
de734bd5
A
1943 while (word--) {
1944 SV ** const tmp = av_fetch( trie->words, word, 0 );
1945 if (tmp) {
1946 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1947 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1948 else
1949 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1950 }
1951 }
07be1b83 1952 });
8e11feef
RGS
1953 if (trie->maxlen) {
1954 convert = n;
1955 } else {
3dab1dad 1956 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1957 DEBUG_r(optimize= n);
3dab1dad
YO
1958 }
1959 }
1960 }
a5ca303d
YO
1961 if (!jumper)
1962 jumper = last;
3dab1dad 1963 if ( trie->maxlen ) {
8e11feef
RGS
1964 NEXT_OFF( convert ) = (U16)(tail - convert);
1965 ARG_SET( convert, data_slot );
786e8c11
YO
1966 /* Store the offset to the first unabsorbed branch in
1967 jump[0], which is otherwise unused by the jump logic.
1968 We use this when dumping a trie and during optimisation. */
1969 if (trie->jump)
7f69552c 1970 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1971
786e8c11
YO
1972 /* XXXX */
1973 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1974 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1975 {
1976 OP( convert ) = TRIEC;
1977 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1978 Safefree(trie->bitmap);
1979 trie->bitmap= NULL;
1980 } else
1981 OP( convert ) = TRIE;
a3621e74 1982
3dab1dad
YO
1983 /* store the type in the flags */
1984 convert->flags = nodetype;
a5ca303d
YO
1985 DEBUG_r({
1986 optimize = convert
1987 + NODE_STEP_REGNODE
1988 + regarglen[ OP( convert ) ];
1989 });
1990 /* XXX We really should free up the resource in trie now,
1991 as we won't use them - (which resources?) dmq */
3dab1dad 1992 }
a3621e74 1993 /* needed for dumping*/
e62cc96a 1994 DEBUG_r(if (optimize) {
07be1b83 1995 regnode *opt = convert;
e62cc96a 1996 while ( ++opt < optimize) {
07be1b83
YO
1997 Set_Node_Offset_Length(opt,0,0);
1998 }
786e8c11
YO
1999 /*
2000 Try to clean up some of the debris left after the
2001 optimisation.
a3621e74 2002 */
786e8c11 2003 while( optimize < jumper ) {
07be1b83 2004 mjd_nodelen += Node_Length((optimize));
a3621e74 2005 OP( optimize ) = OPTIMIZED;
07be1b83 2006 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2007 optimize++;
2008 }
07be1b83 2009 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2010 });
2011 } /* end node insert */
07be1b83 2012#ifndef DEBUGGING
6e8b4190 2013 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 2014#endif
786e8c11
YO
2015 return trie->jump
2016 ? MADE_JUMP_TRIE
2017 : trie->startstate>1
2018 ? MADE_EXACT_TRIE
2019 : MADE_TRIE;
2020}
2021
2022STATIC void
2023S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2024{
2025/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2026
2027 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2029 ISBN 0-201-10088-6
2030
2031 We find the fail state for each state in the trie, this state is the longest proper
2032 suffix of the current states 'word' that is also a proper prefix of another word in our
2033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2034 the DFA not to have to restart after its tried and failed a word at a given point, it
2035 simply continues as though it had been matching the other word in the first place.
2036 Consider
2037 'abcdgu'=~/abcdefg|cdgu/
2038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2039 fail, which would bring use to the state representing 'd' in the second word where we would
2040 try 'g' and succeed, prodceding to match 'cdgu'.
2041 */
2042 /* add a fail transition */
2043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2044 U32 *q;
2045 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2046 const U32 numstates = trie->statecount;
786e8c11
YO
2047 const U32 ubound = trie->lasttrans + ucharcount;
2048 U32 q_read = 0;
2049 U32 q_write = 0;
2050 U32 charid;
2051 U32 base = trie->states[ 1 ].trans.base;
2052 U32 *fail;
2053 reg_ac_data *aho;
2054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2055 GET_RE_DEBUG_FLAGS_DECL;
2056#ifndef DEBUGGING
2057 PERL_UNUSED_ARG(depth);
2058#endif
2059
2060
2061 ARG_SET( stclass, data_slot );
2062 Newxz( aho, 1, reg_ac_data );
2063 RExC_rx->data->data[ data_slot ] = (void*)aho;
2064 aho->trie=trie;
2065 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2066 numstates * sizeof(reg_trie_state));
786e8c11
YO
2067 Newxz( q, numstates, U32);
2068 Newxz( aho->fail, numstates, U32 );
2069 aho->refcount = 1;
2070 fail = aho->fail;
2071 /* initialize fail[0..1] to be 1 so that we always have
2072 a valid final fail state */
2073 fail[ 0 ] = fail[ 1 ] = 1;
2074
2075 for ( charid = 0; charid < ucharcount ; charid++ ) {
2076 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2077 if ( newstate ) {
2078 q[ q_write ] = newstate;
2079 /* set to point at the root */
2080 fail[ q[ q_write++ ] ]=1;
2081 }
2082 }
2083 while ( q_read < q_write) {
2084 const U32 cur = q[ q_read++ % numstates ];
2085 base = trie->states[ cur ].trans.base;
2086
2087 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2088 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2089 if (ch_state) {
2090 U32 fail_state = cur;
2091 U32 fail_base;
2092 do {
2093 fail_state = fail[ fail_state ];
2094 fail_base = aho->states[ fail_state ].trans.base;
2095 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2096
2097 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2098 fail[ ch_state ] = fail_state;
2099 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2100 {
2101 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2102 }
2103 q[ q_write++ % numstates] = ch_state;
2104 }
2105 }
2106 }
2107 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2108 when we fail in state 1, this allows us to use the
2109 charclass scan to find a valid start char. This is based on the principle
2110 that theres a good chance the string being searched contains lots of stuff
2111 that cant be a start char.
2112 */
2113 fail[ 0 ] = fail[ 1 ] = 0;
2114 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2115 PerlIO_printf(Perl_debug_log,
2116 "%*sStclass Failtable (%"UVuf" states): 0",
2117 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2118 );
786e8c11
YO
2119 for( q_read=1; q_read<numstates; q_read++ ) {
2120 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2121 }
2122 PerlIO_printf(Perl_debug_log, "\n");
2123 });
2124 Safefree(q);
2125 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2126}
2127
786e8c11 2128
a3621e74 2129/*
5d1c421c
JH
2130 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2131 * These need to be revisited when a newer toolchain becomes available.
2132 */
2133#if defined(__sparc64__) && defined(__GNUC__)
2134# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2135# undef SPARC64_GCC_WORKAROUND
2136# define SPARC64_GCC_WORKAROUND 1
2137# endif
2138#endif
2139
07be1b83 2140#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2141 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2142 SV * const mysv=sv_newmortal(); \
2143 regnode *Next = regnext(scan); \
2144 regprop(RExC_rx, mysv, scan); \
7f69552c 2145 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2146 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2147 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2148 }});
07be1b83 2149
1de06328
YO
2150
2151
2152
2153
07be1b83
YO
2154#define JOIN_EXACT(scan,min,flags) \
2155 if (PL_regkind[OP(scan)] == EXACT) \
2156 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2157
be8e71aa 2158STATIC U32
07be1b83
YO
2159S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2160 /* Merge several consecutive EXACTish nodes into one. */
2161 regnode *n = regnext(scan);
2162 U32 stringok = 1;
2163 regnode *next = scan + NODE_SZ_STR(scan);
2164 U32 merged = 0;
2165 U32 stopnow = 0;
2166#ifdef DEBUGGING
2167 regnode *stop = scan;
72f13be8 2168 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2169#else
d47053eb
RGS
2170 PERL_UNUSED_ARG(depth);
2171#endif
2172#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2173 PERL_UNUSED_ARG(flags);
2174 PERL_UNUSED_ARG(val);
07be1b83 2175#endif
07be1b83
YO
2176 DEBUG_PEEP("join",scan,depth);
2177
2178 /* Skip NOTHING, merge EXACT*. */
2179 while (n &&
2180 ( PL_regkind[OP(n)] == NOTHING ||
2181 (stringok && (OP(n) == OP(scan))))
2182 && NEXT_OFF(n)
2183 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2184
2185 if (OP(n) == TAIL || n > next)
2186 stringok = 0;
2187 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2188 DEBUG_PEEP("skip:",n,depth);
2189 NEXT_OFF(scan) += NEXT_OFF(n);
2190 next = n + NODE_STEP_REGNODE;
2191#ifdef DEBUGGING
2192 if (stringok)
2193 stop = n;
2194#endif
2195 n = regnext(n);
2196 }
2197 else if (stringok) {
786e8c11 2198 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2199 regnode * const nnext = regnext(n);
2200
2201 DEBUG_PEEP("merg",n,depth);
2202
2203 merged++;
2204 if (oldl + STR_LEN(n) > U8_MAX)
2205 break;
2206 NEXT_OFF(scan) += NEXT_OFF(n);
2207 STR_LEN(scan) += STR_LEN(n);
2208 next = n + NODE_SZ_STR(n);
2209 /* Now we can overwrite *n : */
2210 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2211#ifdef DEBUGGING
2212 stop = next - 1;
2213#endif
2214 n = nnext;
2215 if (stopnow) break;
2216 }
2217
d47053eb
RGS
2218#ifdef EXPERIMENTAL_INPLACESCAN
2219 if (flags && !NEXT_OFF(n)) {
2220 DEBUG_PEEP("atch", val, depth);
2221 if (reg_off_by_arg[OP(n)]) {
2222 ARG_SET(n, val - n);
2223 }
2224 else {
2225 NEXT_OFF(n) = val - n;
2226 }
2227 stopnow = 1;
2228 }
07be1b83
YO
2229#endif
2230 }
2231
2232 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2233 /*
2234 Two problematic code points in Unicode casefolding of EXACT nodes:
2235
2236 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2237 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2238
2239 which casefold to
2240
2241 Unicode UTF-8
2242
2243 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2244 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2245
2246 This means that in case-insensitive matching (or "loose matching",
2247 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2248 length of the above casefolded versions) can match a target string
2249 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2250 This would rather mess up the minimum length computation.
2251
2252 What we'll do is to look for the tail four bytes, and then peek
2253 at the preceding two bytes to see whether we need to decrease
2254 the minimum length by four (six minus two).
2255
2256 Thanks to the design of UTF-8, there cannot be false matches:
2257 A sequence of valid UTF-8 bytes cannot be a subsequence of
2258 another valid sequence of UTF-8 bytes.
2259
2260 */
2261 char * const s0 = STRING(scan), *s, *t;
2262 char * const s1 = s0 + STR_LEN(scan) - 1;
2263 char * const s2 = s1 - 4;
e294cc5d
JH
2264#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2265 const char t0[] = "\xaf\x49\xaf\x42";
2266#else
07be1b83 2267 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2268#endif
07be1b83
YO
2269 const char * const t1 = t0 + 3;
2270
2271 for (s = s0 + 2;
2272 s < s2 && (t = ninstr(s, s1, t0, t1));
2273 s = t + 4) {
e294cc5d
JH
2274#ifdef EBCDIC
2275 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2276 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2277#else
07be1b83
YO
2278 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2279 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2280#endif
07be1b83
YO
2281 *min -= 4;
2282 }
2283 }
2284
2285#ifdef DEBUGGING
2286 /* Allow dumping */
2287 n = scan + NODE_SZ_STR(scan);
2288 while (n <= stop) {
2289 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2290 OP(n) = OPTIMIZED;
2291 NEXT_OFF(n) = 0;
2292 }
2293 n++;
2294 }
2295#endif
2296 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2297 return stopnow;
2298}
2299
653099ff
GS
2300/* REx optimizer. Converts nodes into quickier variants "in place".
2301 Finds fixed substrings. */
2302
a0288114 2303/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2304 to the position after last scanned or to NULL. */
2305
40d049e4
YO
2306#define INIT_AND_WITHP \
2307 assert(!and_withp); \
2308 Newx(and_withp,1,struct regnode_charclass_class); \
2309 SAVEFREEPV(and_withp)
07be1b83 2310
b515a41d
YO
2311/* this is a chain of data about sub patterns we are processing that
2312 need to be handled seperately/specially in study_chunk. Its so
2313 we can simulate recursion without losing state. */
2314struct scan_frame;
2315typedef struct scan_frame {
2316 regnode *last; /* last node to process in this frame */
2317 regnode *next; /* next node to process when last is reached */
2318 struct scan_frame *prev; /*previous frame*/
2319 I32 stop; /* what stopparen do we use */
2320} scan_frame;
2321
76e3520e 2322STATIC I32
40d049e4 2323S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2324 I32 *minlenp, I32 *deltap,
40d049e4
YO
2325 regnode *last,
2326 scan_data_t *data,
2327 I32 stopparen,
2328 U8* recursed,
2329 struct regnode_charclass_class *and_withp,
2330 U32 flags, U32 depth)
c277df42
IZ
2331 /* scanp: Start here (read-write). */
2332 /* deltap: Write maxlen-minlen here. */
2333 /* last: Stop before this one. */
40d049e4
YO
2334 /* data: string data about the pattern */
2335 /* stopparen: treat close N as END */
2336 /* recursed: which subroutines have we recursed into */
2337 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2338{
97aff369 2339 dVAR;
c277df42
IZ
2340 I32 min = 0, pars = 0, code;
2341 regnode *scan = *scanp, *next;
2342 I32 delta = 0;
2343 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2344 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2345 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2346 scan_data_t data_fake;
a3621e74 2347 SV *re_trie_maxbuff = NULL;
786e8c11 2348 regnode *first_non_open = scan;
e2e6a0f1 2349 I32 stopmin = I32_MAX;
8aa23a47
YO
2350 scan_frame *frame = NULL;
2351
a3621e74 2352 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2353
13a24bad 2354#ifdef DEBUGGING
40d049e4 2355 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2356#endif
40d049e4 2357
786e8c11 2358 if ( depth == 0 ) {
40d049e4 2359 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2360 first_non_open=regnext(first_non_open);
2361 }
2362
b81d288d 2363
8aa23a47
YO
2364 fake_study_recurse:
2365 while ( scan && OP(scan) != END && scan < last ){
2366 /* Peephole optimizer: */
2367 DEBUG_STUDYDATA(data,depth);
2368 DEBUG_PEEP("Peep",scan,depth);
2369 JOIN_EXACT(scan,&min,0);
2370
2371 /* Follow the next-chain of the current node and optimize
2372 away all the NOTHINGs from it. */
2373 if (OP(scan) != CURLYX) {
2374 const int max = (reg_off_by_arg[OP(scan)]
2375 ? I32_MAX
2376 /* I32 may be smaller than U16 on CRAYs! */
2377 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2378 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2379 int noff;
2380 regnode *n = scan;
2381
2382 /* Skip NOTHING and LONGJMP. */
2383 while ((n = regnext(n))
2384 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2385 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2386 && off + noff < max)
2387 off += noff;
2388 if (reg_off_by_arg[OP(scan)])
2389 ARG(scan) = off;
2390 else
2391 NEXT_OFF(scan) = off;
2392 }
a3621e74 2393
c277df42 2394
8aa23a47
YO
2395
2396 /* The principal pseudo-switch. Cannot be a switch, since we
2397 look into several different things. */
2398 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2399 || OP(scan) == IFTHEN) {
2400 next = regnext(scan);
2401 code = OP(scan);
2402 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2403
2404 if (OP(next) == code || code == IFTHEN) {
2405 /* NOTE - There is similar code to this block below for handling
2406 TRIE nodes on a re-study. If you change stuff here check there
2407 too. */
2408 I32 max1 = 0, min1 = I32_MAX, num = 0;
2409 struct regnode_charclass_class accum;
2410 regnode * const startbranch=scan;
2411
2412 if (flags & SCF_DO_SUBSTR)
2413 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2414 if (flags & SCF_DO_STCLASS)
2415 cl_init_zero(pRExC_state, &accum);
2416
2417 while (OP(scan) == code) {
2418 I32 deltanext, minnext, f = 0, fake;
2419 struct regnode_charclass_class this_class;
2420
2421 num++;
2422 data_fake.flags = 0;
2423 if (data) {
2424 data_fake.whilem_c = data->whilem_c;
2425 data_fake.last_closep = data->last_closep;
2426 }
2427 else
2428 data_fake.last_closep = &fake;
2429 next = regnext(scan);
2430 scan = NEXTOPER(scan);
2431 if (code != BRANCH)
c277df42 2432 scan = NEXTOPER(scan);
8aa23a47
YO
2433 if (flags & SCF_DO_STCLASS) {
2434 cl_init(pRExC_state, &this_class);
2435 data_fake.start_class = &this_class;
2436 f = SCF_DO_STCLASS_AND;
2437 }
2438 if (flags & SCF_WHILEM_VISITED_POS)
2439 f |= SCF_WHILEM_VISITED_POS;
2440
2441 /* we suppose the run is continuous, last=next...*/
2442 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2443 next, &data_fake,
2444 stopparen, recursed, NULL, f,depth+1);
2445 if (min1 > minnext)
2446 min1 = minnext;
2447 if (max1 < minnext + deltanext)
2448 max1 = minnext + deltanext;
2449 if (deltanext == I32_MAX)
2450 is_inf = is_inf_internal = 1;
2451 scan = next;
2452 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2453 pars++;
2454 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2455 if ( stopmin > minnext)
2456 stopmin = min + min1;
2457 flags &= ~SCF_DO_SUBSTR;
2458 if (data)
2459 data->flags |= SCF_SEEN_ACCEPT;
2460 }
2461 if (data) {
2462 if (data_fake.flags & SF_HAS_EVAL)
2463 data->flags |= SF_HAS_EVAL;
2464 data->whilem_c = data_fake.whilem_c;
3dab1dad 2465 }
8aa23a47
YO
2466 if (flags & SCF_DO_STCLASS)
2467 cl_or(pRExC_state, &accum, &this_class);
2468 }
2469 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2470 min1 = 0;
2471 if (flags & SCF_DO_SUBSTR) {
2472 data->pos_min += min1;
2473 data->pos_delta += max1 - min1;
2474 if (max1 != min1 || is_inf)
2475 data->longest = &(data->longest_float);
2476 }
2477 min += min1;
2478 delta += max1 - min1;
2479 if (flags & SCF_DO_STCLASS_OR) {
2480 cl_or(pRExC_state, data->start_class, &accum);
2481 if (min1) {
2482 cl_and(data->start_class, and_withp);
2483 flags &= ~SCF_DO_STCLASS;
653099ff 2484 }
8aa23a47
YO
2485 }
2486 else if (flags & SCF_DO_STCLASS_AND) {
2487 if (min1) {
2488 cl_and(data->start_class, &accum);
2489 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2490 }
8aa23a47
YO
2491 else {
2492 /* Switch to OR mode: cache the old value of
2493 * data->start_class */
2494 INIT_AND_WITHP;
2495 StructCopy(data->start_class, and_withp,
2496 struct regnode_charclass_class);
2497 flags &= ~SCF_DO_STCLASS_AND;
2498 StructCopy(&accum, data->start_class,
2499 struct regnode_charclass_class);
2500 flags |= SCF_DO_STCLASS_OR;
2501 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2502 }
8aa23a47 2503 }
a3621e74 2504
8aa23a47
YO
2505 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2506 /* demq.
a3621e74 2507
8aa23a47
YO
2508 Assuming this was/is a branch we are dealing with: 'scan' now
2509 points at the item that follows the branch sequence, whatever
2510 it is. We now start at the beginning of the sequence and look
2511 for subsequences of
a3621e74 2512
8aa23a47
YO
2513 BRANCH->EXACT=>x1
2514 BRANCH->EXACT=>x2
2515 tail
a3621e74 2516
8aa23a47 2517 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2518
8aa23a47
YO
2519 If we can find such a subseqence we need to turn the first
2520 element into a trie and then add the subsequent branch exact
2521 strings to the trie.
a3621e74 2522
8aa23a47 2523 We have two cases
a3621e74 2524
8aa23a47 2525 1. patterns where the whole set of branch can be converted.
a3621e74 2526
8aa23a47 2527 2. patterns where only a subset can be converted.
a3621e74 2528
8aa23a47
YO
2529 In case 1 we can replace the whole set with a single regop
2530 for the trie. In case 2 we need to keep the start and end
2531 branchs so
a3621e74 2532
8aa23a47
YO
2533 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2534 becomes BRANCH TRIE; BRANCH X;
786e8c11 2535
8aa23a47
YO
2536 There is an additional case, that being where there is a
2537 common prefix, which gets split out into an EXACT like node
2538 preceding the TRIE node.
a3621e74 2539
8aa23a47
YO
2540 If x(1..n)==tail then we can do a simple trie, if not we make
2541 a "jump" trie, such that when we match the appropriate word
2542 we "jump" to the appopriate tail node. Essentailly we turn
2543 a nested if into a case structure of sorts.
b515a41d 2544
8aa23a47
YO
2545 */
2546
2547 int made=0;
2548 if (!re_trie_maxbuff) {
2549 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2550 if (!SvIOK(re_trie_maxbuff))
2551 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2552 }
2553 if ( SvIV(re_trie_maxbuff)>=0 ) {
2554 regnode *cur;
2555 regnode *first = (regnode *)NULL;
2556 regnode *last = (regnode *)NULL;
2557 regnode *tail = scan;
2558 U8 optype = 0;
2559 U32 count=0;
a3621e74
YO
2560
2561#ifdef DEBUGGING
8aa23a47 2562 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2563#endif
8aa23a47
YO
2564 /* var tail is used because there may be a TAIL
2565 regop in the way. Ie, the exacts will point to the
2566 thing following the TAIL, but the last branch will
2567 point at the TAIL. So we advance tail. If we
2568 have nested (?:) we may have to move through several
2569 tails.
2570 */
2571
2572 while ( OP( tail ) == TAIL ) {
2573 /* this is the TAIL generated by (?:) */
2574 tail = regnext( tail );
2575 }
a3621e74 2576
8aa23a47
YO
2577
2578 DEBUG_OPTIMISE_r({
2579 regprop(RExC_rx, mysv, tail );
2580 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2581 (int)depth * 2 + 2, "",
2582 "Looking for TRIE'able sequences. Tail node is: ",
2583 SvPV_nolen_const( mysv )
2584 );
2585 });
2586
2587 /*
2588
2589 step through the branches, cur represents each
2590 branch, noper is the first thing to be matched
2591 as part of that branch and noper_next is the
2592 regnext() of that node. if noper is an EXACT
2593 and noper_next is the same as scan (our current
2594 position in the regex) then the EXACT branch is
2595 a possible optimization target. Once we have
2596 two or more consequetive such branches we can
2597 create a trie of the EXACT's contents and stich
2598 it in place. If the sequence represents all of
2599 the branches we eliminate the whole thing and
2600 replace it with a single TRIE. If it is a
2601 subsequence then we need to stitch it in. This
2602 means the first branch has to remain, and needs
2603 to be repointed at the item on the branch chain
2604 following the last branch optimized. This could
2605 be either a BRANCH, in which case the
2606 subsequence is internal, or it could be the
2607 item following the branch sequence in which
2608 case the subsequence is at the end.
2609
2610 */
2611
2612 /* dont use tail as the end marker for this traverse */
2613 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2614 regnode * const noper = NEXTOPER( cur );
b515a41d 2615#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2616 regnode * const noper_next = regnext( noper );
b515a41d
YO
2617#endif
2618
8aa23a47
YO
2619 DEBUG_OPTIMISE_r({
2620 regprop(RExC_rx, mysv, cur);
2621 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2622 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2623
2624 regprop(RExC_rx, mysv, noper);
2625 PerlIO_printf( Perl_debug_log, " -> %s",
2626 SvPV_nolen_const(mysv));
2627
2628 if ( noper_next ) {
2629 regprop(RExC_rx, mysv, noper_next );
2630 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2631 SvPV_nolen_const(mysv));
2632 }
2633 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2634 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2635 });
2636 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2637 : PL_regkind[ OP( noper ) ] == EXACT )
2638 || OP(noper) == NOTHING )
786e8c11 2639#ifdef NOJUMPTRIE
8aa23a47 2640 && noper_next == tail
786e8c11 2641#endif
8aa23a47
YO
2642 && count < U16_MAX)
2643 {
2644 count++;
2645 if ( !first || optype == NOTHING ) {
2646 if (!first) first = cur;
2647 optype = OP( noper );
2648 } else {
2649 last = cur;
2650 }
2651 } else {
2652 if ( last ) {
2653 make_trie( pRExC_state,
2654 startbranch, first, cur, tail, count,
2655 optype, depth+1 );
2656 }
2657 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2658#ifdef NOJUMPTRIE
8aa23a47 2659 && noper_next == tail
786e8c11 2660#endif
8aa23a47
YO
2661 ){
2662 count = 1;
2663 first = cur;
2664 optype = OP( noper );
2665 } else {
2666 count = 0;
2667 first = NULL;
2668 optype = 0;
2669 }
2670 last = NULL;
2671 }
2672 }
2673 DEBUG_OPTIMISE_r({
2674 regprop(RExC_rx, mysv, cur);
2675 PerlIO_printf( Perl_debug_log,
2676 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2677 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2678
2679 });
2680 if ( last ) {
2681 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2682#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2683 if ( ((made == MADE_EXACT_TRIE &&
2684 startbranch == first)
2685 || ( first_non_open == first )) &&
2686 depth==0 ) {
2687 flags |= SCF_TRIE_RESTUDY;
2688 if ( startbranch == first
2689 && scan == tail )
2690 {
2691 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2692 }
2693 }
3dab1dad 2694#endif
8aa23a47
YO
2695 }
2696 }
2697
2698 } /* do trie */
2699
653099ff 2700 }
8aa23a47
YO
2701 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2702 scan = NEXTOPER(NEXTOPER(scan));
2703 } else /* single branch is optimized. */
2704 scan = NEXTOPER(scan);
2705 continue;
2706 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2707 scan_frame *newframe = NULL;
2708 I32 paren;
2709 regnode *start;
2710 regnode *end;
2711
2712 if (OP(scan) != SUSPEND) {
2713 /* set the pointer */
2714 if (OP(scan) == GOSUB) {
2715 paren = ARG(scan);
2716 RExC_recurse[ARG2L(scan)] = scan;
2717 start = RExC_open_parens[paren-1];
2718 end = RExC_close_parens[paren-1];
2719 } else {
2720 paren = 0;
2721 start = RExC_rx->program + 1;
2722 end = RExC_opend;
2723 }
2724 if (!recursed) {
2725 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2726 SAVEFREEPV(recursed);
2727 }
2728 if (!PAREN_TEST(recursed,paren+1)) {
2729 PAREN_SET(recursed,paren+1);
2730 Newx(newframe,1,scan_frame);
2731 } else {
2732 if (flags & SCF_DO_SUBSTR) {
2733 scan_commit(pRExC_state,data,minlenp);
2734 data->longest = &(data->longest_float);
2735 }
2736 is_inf = is_inf_internal = 1;
2737 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2738 cl_anything(pRExC_state, data->start_class);
2739 flags &= ~SCF_DO_STCLASS;
2740 }
2741 } else {
2742 Newx(newframe,1,scan_frame);
2743 paren = stopparen;
2744 start = scan+2;
2745 end = regnext(scan);
2746 }
2747 if (newframe) {
2748 assert(start);
2749 assert(end);
2750 SAVEFREEPV(newframe);
2751 newframe->next = regnext(scan);
2752 newframe->last = last;
2753 newframe->stop = stopparen;
2754 newframe->prev = frame;
2755
2756 frame = newframe;
2757 scan = start;
2758 stopparen = paren;
2759 last = end;
2760
2761 continue;
2762 }
2763 }
2764 else if (OP(scan) == EXACT) {
2765 I32 l = STR_LEN(scan);
2766 UV uc;
2767 if (UTF) {
2768 const U8 * const s = (U8*)STRING(scan);
2769 l = utf8_length(s, s + l);
2770 uc = utf8_to_uvchr(s, NULL);
2771 } else {
2772 uc = *((U8*)STRING(scan));
2773 }
2774 min += l;
2775 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2776 /* The code below prefers earlier match for fixed
2777 offset, later match for variable offset. */
2778 if (data->last_end == -1) { /* Update the start info. */
2779 data->last_start_min = data->pos_min;
2780 data->last_start_max = is_inf
2781 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2782 }
8aa23a47
YO
2783 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2784 if (UTF)
2785 SvUTF8_on(data->last_found);
2786 {
2787 SV * const sv = data->last_found;
2788 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2789 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2790 if (mg && mg->mg_len >= 0)
2791 mg->mg_len += utf8_length((U8*)STRING(scan),
2792 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2793 }
8aa23a47
YO
2794 data->last_end = data->pos_min + l;
2795 data->pos_min += l; /* As in the first entry. */
2796 data->flags &= ~SF_BEFORE_EOL;
2797 }
2798 if (flags & SCF_DO_STCLASS_AND) {
2799 /* Check whether it is compatible with what we know already! */
2800 int compat = 1;
2801
2802 if (uc >= 0x100 ||
2803 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2804 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2805 && (!(data->start_class->flags & ANYOF_FOLD)
2806 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2807 )
2808 compat = 0;
2809 ANYOF_CLASS_ZERO(data->start_class);
2810 ANYOF_BITMAP_ZERO(data->start_class);
2811 if (compat)
2812 ANYOF_BITMAP_SET(data->start_class, uc);
2813 data->start_class->flags &= ~ANYOF_EOS;
2814 if (uc < 0x100)
2815 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2816 }
2817 else if (flags & SCF_DO_STCLASS_OR) {
2818 /* false positive possible if the class is case-folded */
2819 if (uc < 0x100)
2820 ANYOF_BITMAP_SET(data->start_class, uc);
2821 else
2822 data->start_class->flags |= ANYOF_UNICODE_ALL;
2823 data->start_class->flags &= ~ANYOF_EOS;
2824 cl_and(data->start_class, and_withp);
2825 }
2826 flags &= ~SCF_DO_STCLASS;
2827 }
2828 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2829 I32 l = STR_LEN(scan);
2830 UV uc = *((U8*)STRING(scan));
2831
2832 /* Search for fixed substrings supports EXACT only. */
2833 if (flags & SCF_DO_SUBSTR) {
2834 assert(data);
2835 scan_commit(pRExC_state, data, minlenp);
2836 }
2837 if (UTF) {
2838 const U8 * const s = (U8 *)STRING(scan);
2839 l = utf8_length(s, s + l);
2840 uc = utf8_to_uvchr(s, NULL);
2841 }
2842 min += l;
2843 if (flags & SCF_DO_SUBSTR)
2844 data->pos_min += l;
2845 if (flags & SCF_DO_STCLASS_AND) {
2846 /* Check whether it is compatible with what we know already! */
2847 int compat = 1;
2848
2849 if (uc >= 0x100 ||
2850 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2851 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2852 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2853 compat = 0;
2854 ANYOF_CLASS_ZERO(data->start_class);
2855 ANYOF_BITMAP_ZERO(data->start_class);
2856 if (compat) {
2857 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2858 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2859 data->start_class->flags |= ANYOF_FOLD;
2860 if (OP(scan) == EXACTFL)
2861 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2862 }
8aa23a47
YO
2863 }
2864 else if (flags & SCF_DO_STCLASS_OR) {
2865 if (data->start_class->flags & ANYOF_FOLD) {
2866 /* false positive possible if the class is case-folded.
2867 Assume that the locale settings are the same... */
1aa99e6b
IH
2868 if (uc < 0x100)
2869 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2870 data->start_class->flags &= ~ANYOF_EOS;
2871 }
8aa23a47 2872 cl_and(data->start_class, and_withp);
653099ff 2873 }
8aa23a47
YO
2874 flags &= ~SCF_DO_STCLASS;
2875 }
2876 else if (strchr((const char*)PL_varies,OP(scan))) {
2877 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2878 I32 f = flags, pos_before = 0;
2879 regnode * const oscan = scan;
2880 struct regnode_charclass_class this_class;
2881 struct regnode_charclass_class *oclass = NULL;
2882 I32 next_is_eval = 0;
2883
2884 switch (PL_regkind[OP(scan)]) {
2885 case WHILEM: /* End of (?:...)* . */
2886 scan = NEXTOPER(scan);
2887 goto finish;
2888 case PLUS:
2889 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2890 next = NEXTOPER(scan);
2891 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2892 mincount = 1;
2893 maxcount = REG_INFTY;
2894 next = regnext(scan);
2895 scan = NEXTOPER(scan);
2896 goto do_curly;
2897 }
2898 }
2899 if (flags & SCF_DO_SUBSTR)
2900 data->pos_min++;
2901 min++;
2902 /* Fall through. */
2903 case STAR:
2904 if (flags & SCF_DO_STCLASS) {
2905 mincount = 0;
2906 maxcount = REG_INFTY;
2907 next = regnext(scan);
2908 scan = NEXTOPER(scan);
2909 goto do_curly;
2910 }
2911 is_inf = is_inf_internal = 1;
2912 scan = regnext(scan);
c277df42 2913 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2914 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2915 data->longest = &(data->longest_float);
c277df42 2916 }
8aa23a47
YO
2917 goto optimize_curly_tail;
2918 case CURLY:
2919 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2920 && (scan->flags == stopparen))
2921 {
2922 mincount = 1;
2923 maxcount = 1;
2924 } else {
2925 mincount = ARG1(scan);
2926 maxcount = ARG2(scan);
653099ff 2927 }
8aa23a47
YO
2928 next = regnext(scan);
2929 if (OP(scan) == CURLYX) {
2930 I32 lp = (data ? *(data->last_closep) : 0);
2931 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2932 }
8aa23a47
YO
2933 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2934 next_is_eval = (OP(scan) == EVAL);
2935 do_curly:
2936 if (flags & SCF_DO_SUBSTR) {
2937 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2938 pos_before = data->pos_min;
b45f050a 2939 }
8aa23a47
YO
2940 if (data) {
2941 fl = data->flags;
2942 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2943 if (is_inf)
2944 data->flags |= SF_IS_INF;
2945 }
2946 if (flags & SCF_DO_STCLASS) {
2947 cl_init(pRExC_state, &this_class);
2948 oclass = data->start_class;
2949 data->start_class = &this_class;
2950 f |= SCF_DO_STCLASS_AND;
2951 f &= ~SCF_DO_STCLASS_OR;
2952 }
2953 /* These are the cases when once a subexpression
2954 fails at a particular position, it cannot succeed
2955 even after backtracking at the enclosing scope.
2956
2957 XXXX what if minimal match and we are at the
2958 initial run of {n,m}? */
2959 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2960 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2961
8aa23a47
YO
2962 /* This will finish on WHILEM, setting scan, or on NULL: */
2963 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2964 last, data, stopparen, recursed, NULL,
2965 (mincount == 0
2966 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2967
8aa23a47
YO
2968 if (flags & SCF_DO_STCLASS)
2969 data->start_class = oclass;
2970 if (mincount == 0 || minnext == 0) {
2971 if (flags & SCF_DO_STCLASS_OR) {
2972 cl_or(pRExC_state, data->start_class, &this_class);
2973 }
2974 else if (flags & SCF_DO_STCLASS_AND) {
2975 /* Switch to OR mode: cache the old value of
2976 * data->start_class */
2977 INIT_AND_WITHP;
2978 StructCopy(data->start_class, and_withp,
2979 struct regnode_charclass_class);
2980 flags &= ~SCF_DO_STCLASS_AND;
2981 StructCopy(&this_class, data->start_class,
2982 struct regnode_charclass_class);
2983 flags |= SCF_DO_STCLASS_OR;
2984 data->start_class->flags |= ANYOF_EOS;
2985 }
2986 } else { /* Non-zero len */
2987 if (flags & SCF_DO_STCLASS_OR) {
2988 cl_or(pRExC_state, data->start_class, &this_class);
2989 cl_and(data->start_class, and_withp);
2990 }
2991 else if (flags & SCF_DO_STCLASS_AND)
2992 cl_and(data->start_class, &this_class);
2993 flags &= ~SCF_DO_STCLASS;
2994 }
2995 if (!scan) /* It was not CURLYX, but CURLY. */
2996 scan = next;
2997 if ( /* ? quantifier ok, except for (?{ ... }) */
2998 (next_is_eval || !(mincount == 0 && maxcount == 1))
2999 && (minnext == 0) && (deltanext == 0)
3000 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3001 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3002 && ckWARN(WARN_REGEXP))
3003 {
3004 vWARN(RExC_parse,
3005 "Quantifier unexpected on zero-length expression");
3006 }
3007
3008 min += minnext * mincount;
3009 is_inf_internal |= ((maxcount == REG_INFTY
3010 && (minnext + deltanext) > 0)
3011 || deltanext == I32_MAX);
3012 is_inf |= is_inf_internal;
3013 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3014
3015 /* Try powerful optimization CURLYX => CURLYN. */
3016 if ( OP(oscan) == CURLYX && data
3017 && data->flags & SF_IN_PAR
3018 && !(data->flags & SF_HAS_EVAL)
3019 && !deltanext && minnext == 1 ) {
3020 /* Try to optimize to CURLYN. */
3021 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3022 regnode * const nxt1 = nxt;
497b47a8 3023#ifdef DEBUGGING
8aa23a47 3024 regnode *nxt2;
497b47a8 3025#endif
c277df42 3026
8aa23a47
YO
3027 /* Skip open. */
3028 nxt = regnext(nxt);
3029 if (!strchr((const char*)PL_simple,OP(nxt))
3030 && !(PL_regkind[OP(nxt)] == EXACT
3031 && STR_LEN(nxt) == 1))
3032 goto nogo;
497b47a8 3033#ifdef DEBUGGING
8aa23a47 3034 nxt2 = nxt;
497b47a8 3035#endif
8aa23a47
YO
3036 nxt = regnext(nxt);
3037 if (OP(nxt) != CLOSE)
3038 goto nogo;
3039 if (RExC_open_parens) {
3040 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3041 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3042 }
3043 /* Now we know that nxt2 is the only contents: */
3044 oscan->flags = (U8)ARG(nxt);
3045 OP(oscan) = CURLYN;
3046 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3047
c277df42 3048#ifdef DEBUGGING
8aa23a47
YO
3049 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3050 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3051 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3052 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3053 OP(nxt + 1) = OPTIMIZED; /* was count. */
3054 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3055#endif
8aa23a47
YO
3056 }
3057 nogo:
3058
3059 /* Try optimization CURLYX => CURLYM. */
3060 if ( OP(oscan) == CURLYX && data
3061 && !(data->flags & SF_HAS_PAR)
3062 && !(data->flags & SF_HAS_EVAL)
3063 && !deltanext /* atom is fixed width */
3064 && minnext != 0 /* CURLYM can't handle zero width */
3065 ) {
3066 /* XXXX How to optimize if data == 0? */
3067 /* Optimize to a simpler form. */
3068 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3069 regnode *nxt2;
3070
3071 OP(oscan) = CURLYM;
3072 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3073 && (OP(nxt2) != WHILEM))
3074 nxt = nxt2;
3075 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3076 /* Need to optimize away parenths. */
3077 if (data->flags & SF_IN_PAR) {
3078 /* Set the parenth number. */
3079 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3080
3081 if (OP(nxt) != CLOSE)
3082 FAIL("Panic opt close");
3083 oscan->flags = (U8)ARG(nxt);
3084 if (RExC_open_parens) {
3085 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3086 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3087 }
8aa23a47
YO
3088 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3089 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3090
c277df42 3091#ifdef DEBUGGING
8aa23a47
YO
3092 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3093 OP(nxt + 1) = OPTIMIZED; /* was count. */
3094 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3095 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3096#endif
c277df42 3097#if 0
8aa23a47
YO
3098 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3099 regnode *nnxt = regnext(nxt1);
3100
3101 if (nnxt == nxt) {
3102 if (reg_off_by_arg[OP(nxt1)])
3103 ARG_SET(nxt1, nxt2 - nxt1);
3104 else if (nxt2 - nxt1 < U16_MAX)
3105 NEXT_OFF(nxt1) = nxt2 - nxt1;
3106 else
3107 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3108 }
8aa23a47 3109 nxt1 = nnxt;
c277df42 3110 }
5d1c421c 3111#endif
8aa23a47
YO
3112 /* Optimize again: */
3113 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3114 NULL, stopparen, recursed, NULL, 0,depth+1);
3115 }
3116 else
3117 oscan->flags = 0;
3118 }
3119 else if ((OP(oscan) == CURLYX)
3120 && (flags & SCF_WHILEM_VISITED_POS)
3121 /* See the comment on a similar expression above.
3122 However, this time it not a subexpression
3123 we care about, but the expression itself. */
3124 && (maxcount == REG_INFTY)
3125 && data && ++data->whilem_c < 16) {
3126 /* This stays as CURLYX, we can put the count/of pair. */
3127 /* Find WHILEM (as in regexec.c) */
3128 regnode *nxt = oscan + NEXT_OFF(oscan);
3129
3130 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3131 nxt += ARG(nxt);
3132 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3133 | (RExC_whilem_seen << 4)); /* On WHILEM */
3134 }
3135 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3136 pars++;
3137 if (flags & SCF_DO_SUBSTR) {
3138 SV *last_str = NULL;
3139 int counted = mincount != 0;
a0ed51b3 3140
8aa23a47
YO
3141 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3142#if defined(SPARC64_GCC_WORKAROUND)
3143 I32 b = 0;
3144 STRLEN l = 0;
3145 const char *s = NULL;
3146 I32 old = 0;
b515a41d 3147
8aa23a47
YO
3148 if (pos_before >= data->last_start_min)
3149 b = pos_before;
3150 else
3151 b = data->last_start_min;
b515a41d 3152
8aa23a47
YO
3153 l = 0;
3154 s = SvPV_const(data->last_found, l);
3155 old = b - data->last_start_min;
3156
3157#else
3158 I32 b = pos_before >= data->last_start_min
3159 ? pos_before : data->last_start_min;
3160 STRLEN l;
3161 const char * const s = SvPV_const(data->last_found, l);
3162 I32 old = b - data->last_start_min;
3163#endif
3164
3165 if (UTF)
3166 old = utf8_hop((U8*)s, old) - (U8*)s;
3167
3168 l -= old;
3169 /* Get the added string: */
3170 last_str = newSVpvn(s + old, l);
3171 if (UTF)
3172 SvUTF8_on(last_str);
3173 if (deltanext == 0 && pos_before == b) {
3174 /* What was added is a constant string */
3175 if (mincount > 1) {
3176 SvGROW(last_str, (mincount * l) + 1);
3177 repeatcpy(SvPVX(last_str) + l,
3178 SvPVX_const(last_str), l, mincount - 1);
3179 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3180 /* Add additional parts. */
3181 SvCUR_set(data->last_found,
3182 SvCUR(data->last_found) - l);
3183 sv_catsv(data->last_found, last_str);
3184 {
3185 SV * sv = data->last_found;
3186 MAGIC *mg =
3187 SvUTF8(sv) && SvMAGICAL(sv) ?
3188 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3189 if (mg && mg->mg_len >= 0)
3190 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3191 }
8aa23a47 3192 data->last_end += l * (mincount - 1);
b515a41d 3193 }
8aa23a47
YO
3194 } else {
3195 /* start offset must point into the last copy */
3196 data->last_start_min += minnext * (mincount - 1);
3197 data->last_start_max += is_inf ? I32_MAX
3198 : (maxcount - 1) * (minnext + data->pos_delta);
3199 }
c277df42 3200 }
8aa23a47
YO
3201 /* It is counted once already... */
3202 data->pos_min += minnext * (mincount - counted);
3203 data->pos_delta += - counted * deltanext +
3204 (minnext + deltanext) * maxcount - minnext * mincount;
3205 if (mincount != maxcount) {
3206 /* Cannot extend fixed substrings found inside
3207 the group. */
3208 scan_commit(pRExC_state,data,minlenp);
3209 if (mincount && last_str) {
3210 SV * const sv = data->last_found;
3211 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3212 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3213
3214 if (mg)
3215 mg->mg_len = -1;
3216 sv_setsv(sv, last_str);
3217 data->last_end = data->pos_min;
3218 data->last_start_min =
3219 data->pos_min - CHR_SVLEN(last_str);
3220 data->last_start_max = is_inf
3221 ? I32_MAX
3222 : data->pos_min + data->pos_delta
3223 - CHR_SVLEN(last_str);
3224 }
3225 data->longest = &(data->longest_float);
3226 }
3227 SvREFCNT_dec(last_str);
c277df42 3228 }
8aa23a47
YO
3229 if (data && (fl & SF_HAS_EVAL))
3230 data->flags |= SF_HAS_EVAL;
3231 optimize_curly_tail:
3232 if (OP(oscan) != CURLYX) {
3233 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3234 && NEXT_OFF(next))
3235 NEXT_OFF(oscan) += NEXT_OFF(next);
3236 }
3237 continue;
3238 default: /* REF and CLUMP only? */
3239 if (flags & SCF_DO_SUBSTR) {
3240 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3241 data->longest = &(data->longest_float);
3242 }
3243 is_inf = is_inf_internal = 1;
3244 if (flags & SCF_DO_STCLASS_OR)
3245 cl_anything(pRExC_state, data->start_class);
3246 flags &= ~SCF_DO_STCLASS;
3247 break;
c277df42 3248 }
8aa23a47
YO
3249 }
3250 else if (strchr((const char*)PL_simple,OP(scan))) {
3251 int value = 0;
653099ff 3252
8aa23a47
YO
3253 if (flags & SCF_DO_SUBSTR) {
3254 scan_commit(pRExC_state,data,minlenp);
3255 data->pos_min++;
3256 }
3257 min++;
3258 if (flags & SCF_DO_STCLASS) {
3259 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3260
8aa23a47
YO
3261 /* Some of the logic below assumes that switching
3262 locale on will only add false positives. */
3263 switch (PL_regkind[OP(scan)]) {
3264 case SANY:
3265 default:
3266 do_default:
3267 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3268 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3269 cl_anything(pRExC_state, data->start_class);
3270 break;
3271 case REG_ANY:
3272 if (OP(scan) == SANY)
3273 goto do_default;
3274 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3275 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3276 || (data->start_class->flags & ANYOF_CLASS));
3277 cl_anything(pRExC_state, data->start_class);
653099ff 3278 }
8aa23a47
YO
3279 if (flags & SCF_DO_STCLASS_AND || !value)
3280 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3281 break;
3282 case ANYOF:
3283 if (flags & SCF_DO_STCLASS_AND)
3284 cl_and(data->start_class,
3285 (struct regnode_charclass_class*)scan);
653099ff 3286 else
8aa23a47
YO
3287 cl_or(pRExC_state, data->start_class,
3288 (struct regnode_charclass_class*)scan);
3289 break;
3290 case ALNUM:
3291 if (flags & SCF_DO_STCLASS_AND) {
3292 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3293 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3294 for (value = 0; value < 256; value++)
3295 if (!isALNUM(value))
3296 ANYOF_BITMAP_CLEAR(data->start_class, value);
3297 }
653099ff 3298 }
8aa23a47
YO
3299 else {
3300 if (data->start_class->flags & ANYOF_LOCALE)
3301 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3302 else {
3303 for (value = 0; value < 256; value++)
3304 if (isALNUM(value))
3305 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3306 }
8aa23a47
YO
3307 }
3308 break;
3309 case ALNUML:
3310 if (flags & SCF_DO_STCLASS_AND) {
3311 if (data->start_class->flags & ANYOF_LOCALE)
3312 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3313 }
3314 else {
3315 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3316 data->start_class->flags |= ANYOF_LOCALE;
3317 }
3318 break;
3319 case NALNUM:
3320 if (flags & SCF_DO_STCLASS_AND) {
3321 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3322 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3323 for (value = 0; value < 256; value++)
3324 if (isALNUM(value))
3325 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3326 }
3327 }
8aa23a47
YO
3328 else {
3329 if (data->start_class->flags & ANYOF_LOCALE)
3330 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3331 else {
3332 for (value = 0; value < 256; value++)
3333 if (!isALNUM(value))
3334 ANYOF_BITMAP_SET(data->start_class, value);
3335 }
653099ff 3336 }
8aa23a47
YO
3337 break;
3338 case NALNUML:
3339 if (flags & SCF_DO_STCLASS_AND) {
3340 if (data->start_class->flags & ANYOF_LOCALE)
3341 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3342 }
8aa23a47
YO
3343 else {
3344 data->start_class->flags |= ANYOF_LOCALE;
3345 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3346 }
3347 break;
3348 case SPACE:
3349 if (flags & SCF_DO_STCLASS_AND) {
3350 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3351 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3352 for (value = 0; value < 256; value++)
3353 if (!isSPACE(value))
3354 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3355 }
3356 }
8aa23a47
YO
3357 else {
3358 if (data->start_class->flags & ANYOF_LOCALE)
3359 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3360 else {
3361 for (value = 0; value < 256; value++)
3362 if (isSPACE(value))
3363 ANYOF_BITMAP_SET(data->start_class, value);
3364 }
653099ff 3365 }
8aa23a47
YO
3366 break;
3367 case SPACEL:
3368 if (flags & SCF_DO_STCLASS_AND) {
3369 if (data->start_class->flags & ANYOF_LOCALE)
3370 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3371 }
3372 else {
3373 data->start_class->flags |= ANYOF_LOCALE;
3374 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3375 }
3376 break;
3377 case NSPACE:
3378 if (flags & SCF_DO_STCLASS_AND) {
3379 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3380 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3381 for (value = 0; value < 256; value++)
3382 if (isSPACE(value))
3383 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3384 }
8aa23a47
YO
3385 }
3386 else {
3387 if (data->start_class->flags & ANYOF_LOCALE)
3388 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3389 else {
3390 for (value = 0; value < 256; value++)
3391 if (!isSPACE(value))
3392 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3393 }
3394 }
8aa23a47
YO
3395 break;
3396 case NSPACEL:
3397 if (flags & SCF_DO_STCLASS_AND) {
3398 if (data->start_class->flags & ANYOF_LOCALE) {
3399 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3400 for (value = 0; value < 256; value++)
3401 if (!isSPACE(value))
3402 ANYOF_BITMAP_CLEAR(data->start_class, value);
3403 }
653099ff 3404 }
8aa23a47
YO
3405 else {
3406 data->start_class->flags |= ANYOF_LOCALE;
3407 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3408 }
3409 break;
3410 case DIGIT:
3411 if (flags & SCF_DO_STCLASS_AND) {
3412 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3413 for (value = 0; value < 256; value++)
3414 if (!isDIGIT(value))
3415 ANYOF_BITMAP_CLEAR(data->start_class, value);
3416 }
3417 else {
3418 if (data->start_class->flags & ANYOF_LOCALE)
3419 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3420 else {
3421 for (value = 0; value < 256; value++)
3422 if (isDIGIT(value))
3423 ANYOF_BITMAP_SET(data->start_class, value);
3424 }
3425 }
3426 break;
3427 case NDIGIT:
3428 if (flags & SCF_DO_STCLASS_AND) {
3429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3430 for (value = 0; value < 256; value++)
3431 if (isDIGIT(value))
3432 ANYOF_BITMAP_CLEAR(data->start_class, value);
3433 }
3434 else {
3435 if (data->start_class->flags & ANYOF_LOCALE)
3436 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3437 else {
3438 for (value = 0; value < 256; value++)
3439 if (!isDIGIT(value))
3440 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3441 }
3442 }
8aa23a47
YO
3443 break;
3444 }
3445 if (flags & SCF_DO_STCLASS_OR)
3446 cl_and(data->start_class, and_withp);
3447 flags &= ~SCF_DO_STCLASS;
3448 }
3449 }
3450 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3451 data->flags |= (OP(scan) == MEOL
3452 ? SF_BEFORE_MEOL
3453 : SF_BEFORE_SEOL);
3454 }
3455 else if ( PL_regkind[OP(scan)] == BRANCHJ
3456 /* Lookbehind, or need to calculate parens/evals/stclass: */
3457 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3458 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3459 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3460 || OP(scan) == UNLESSM )
3461 {
3462 /* Negative Lookahead/lookbehind
3463 In this case we can't do fixed string optimisation.
3464 */
1de06328 3465
8aa23a47
YO
3466 I32 deltanext, minnext, fake = 0;
3467 regnode *nscan;
3468 struct regnode_charclass_class intrnl;
3469 int f = 0;
1de06328 3470
8aa23a47
YO
3471 data_fake.flags = 0;
3472 if (data) {
3473 data_fake.whilem_c = data->whilem_c;
3474 data_fake.last_closep = data->last_closep;
c277df42 3475 }
8aa23a47
YO
3476 else
3477 data_fake.last_closep = &fake;
3478 if ( flags & SCF_DO_STCLASS && !scan->flags
3479 && OP(scan) == IFMATCH ) { /* Lookahead */
3480 cl_init(pRExC_state, &intrnl);
3481 data_fake.start_class = &intrnl;
3482 f |= SCF_DO_STCLASS_AND;
3483 }
3484 if (flags & SCF_WHILEM_VISITED_POS)
3485 f |= SCF_WHILEM_VISITED_POS;
3486 next = regnext(scan);
3487 nscan = NEXTOPER(NEXTOPER(scan));
3488 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3489 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3490 if (scan->flags) {
3491 if (deltanext) {
3492 vFAIL("Variable length lookbehind not implemented");
3493 }
3494 else if (minnext > (I32)U8_MAX) {
3495 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3496 }
3497 scan->flags = (U8)minnext;
3498 }
3499 if (data) {
3500 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3501 pars++;
3502 if (data_fake.flags & SF_HAS_EVAL)
3503 data->flags |= SF_HAS_EVAL;
3504 data->whilem_c = data_fake.whilem_c;
3505 }
3506 if (f & SCF_DO_STCLASS_AND) {
3507 const int was = (data->start_class->flags & ANYOF_EOS);
3508
3509 cl_and(data->start_class, &intrnl);
3510 if (was)
3511 data->start_class->flags |= ANYOF_EOS;
3512 }
cb434fcc 3513 }
8aa23a47
YO
3514#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3515 else {
3516 /* Positive Lookahead/lookbehind
3517 In this case we can do fixed string optimisation,
3518 but we must be careful about it. Note in the case of
3519 lookbehind the positions will be offset by the minimum
3520 length of the pattern, something we won't know about
3521 until after the recurse.
3522 */
3523 I32 deltanext, fake = 0;
3524 regnode *nscan;
3525 struct regnode_charclass_class intrnl;
3526 int f = 0;
3527 /* We use SAVEFREEPV so that when the full compile
3528 is finished perl will clean up the allocated
3529 minlens when its all done. This was we don't
3530 have to worry about freeing them when we know
3531 they wont be used, which would be a pain.
3532 */
3533 I32 *minnextp;
3534 Newx( minnextp, 1, I32 );
3535 SAVEFREEPV(minnextp);
3536
3537 if (data) {
3538 StructCopy(data, &data_fake, scan_data_t);
3539 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3540 f |= SCF_DO_SUBSTR;
3541 if (scan->flags)
3542 scan_commit(pRExC_state, &data_fake,minlenp);
3543 data_fake.last_found=newSVsv(data->last_found);
3544 }
3545 }
3546 else
3547 data_fake.last_closep = &fake;
3548 data_fake.flags = 0;
3549 if (is_inf)
3550 data_fake.flags |= SF_IS_INF;
3551 if ( flags & SCF_DO_STCLASS && !scan->flags
3552 && OP(scan) == IFMATCH ) { /* Lookahead */
3553 cl_init(pRExC_state, &intrnl);
3554 data_fake.start_class = &intrnl;
3555 f |= SCF_DO_STCLASS_AND;
3556 }
3557 if (flags & SCF_WHILEM_VISITED_POS)
3558 f |= SCF_WHILEM_VISITED_POS;
3559 next = regnext(scan);
3560 nscan = NEXTOPER(NEXTOPER(scan));
3561
3562 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3563 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3564 if (scan->flags) {
3565 if (deltanext) {
3566 vFAIL("Variable length lookbehind not implemented");
3567 }
3568 else if (*minnextp > (I32)U8_MAX) {
3569 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3570 }
3571 scan->flags = (U8)*minnextp;
3572 }
3573
3574 *minnextp += min;
3575
3576 if (f & SCF_DO_STCLASS_AND) {
3577 const int was = (data->start_class->flags & ANYOF_EOS);
3578
3579 cl_and(data->start_class, &intrnl);
3580 if (was)
3581 data->start_class->flags |= ANYOF_EOS;
3582 }
3583 if (data) {
3584 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3585 pars++;
3586 if (data_fake.flags & SF_HAS_EVAL)
3587 data->flags |= SF_HAS_EVAL;
3588 data->whilem_c = data_fake.whilem_c;
3589 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3590 if (RExC_rx->minlen<*minnextp)
3591 RExC_rx->minlen=*minnextp;
3592 scan_commit(pRExC_state, &data_fake, minnextp);
3593 SvREFCNT_dec(data_fake.last_found);
3594
3595 if ( data_fake.minlen_fixed != minlenp )
3596 {
3597 data->offset_fixed= data_fake.offset_fixed;
3598 data->minlen_fixed= data_fake.minlen_fixed;
3599 data->lookbehind_fixed+= scan->flags;
3600 }
3601 if ( data_fake.minlen_float != minlenp )
3602 {
3603 data->minlen_float= data_fake.minlen_float;
3604 data->offset_float_min=data_fake.offset_float_min;
3605 data->offset_float_max=data_fake.offset_float_max;
3606 data->lookbehind_float+= scan->flags;
3607 }
3608 }
3609 }
3610
3611
40d049e4 3612 }
8aa23a47
YO
3613#endif
3614 }
3615 else if (OP(scan) == OPEN) {
3616 if (stopparen != (I32)ARG(scan))
3617 pars++;
3618 }
3619 else if (OP(scan) == CLOSE) {
3620 if (stopparen == (I32)ARG(scan)) {
3621 break;
3622 }
3623 if ((I32)ARG(scan) == is_par) {
3624 next = regnext(scan);
b515a41d 3625
8aa23a47
YO
3626 if ( next && (OP(next) != WHILEM) && next < last)
3627 is_par = 0; /* Disable optimization */
40d049e4 3628 }
8aa23a47
YO
3629 if (data)
3630 *(data->last_closep) = ARG(scan);
3631 }
3632 else if (OP(scan) == EVAL) {
c277df42
IZ
3633 if (data)
3634 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3635 }
3636 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3637 if (flags & SCF_DO_SUBSTR) {
3638 scan_commit(pRExC_state,data,minlenp);
3639 flags &= ~SCF_DO_SUBSTR;
40d049e4 3640 }
8aa23a47
YO
3641 if (data && OP(scan)==ACCEPT) {
3642 data->flags |= SCF_SEEN_ACCEPT;
3643 if (stopmin > min)
3644 stopmin = min;
e2e6a0f1 3645 }
8aa23a47
YO
3646 }
3647 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3648 {
0f5d15d6 3649 if (flags & SCF_DO_SUBSTR) {
1de06328 3650 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3651 data->longest = &(data->longest_float);
3652 }
3653 is_inf = is_inf_internal = 1;
653099ff 3654 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3655 cl_anything(pRExC_state, data->start_class);
96776eda 3656 flags &= ~SCF_DO_STCLASS;
8aa23a47 3657 }
786e8c11 3658#ifdef TRIE_STUDY_OPT
40d049e4 3659#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3660 else if (PL_regkind[OP(scan)] == TRIE) {
3661 /* NOTE - There is similar code to this block above for handling
3662 BRANCH nodes on the initial study. If you change stuff here
3663 check there too. */
3664 regnode *trie_node= scan;
3665 regnode *tail= regnext(scan);
3666 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3667 I32 max1 = 0, min1 = I32_MAX;
3668 struct regnode_charclass_class accum;
3669
3670 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3671 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3672 if (flags & SCF_DO_STCLASS)
3673 cl_init_zero(pRExC_state, &accum);
3674
3675 if (!trie->jump) {
3676 min1= trie->minlen;
3677 max1= trie->maxlen;
3678 } else {
3679 const regnode *nextbranch= NULL;
3680 U32 word;
3681
3682 for ( word=1 ; word <= trie->wordcount ; word++)
3683 {
3684 I32 deltanext=0, minnext=0, f = 0, fake;
3685 struct regnode_charclass_class this_class;
3686
3687 data_fake.flags = 0;
3688 if (data) {
3689 data_fake.whilem_c = data->whilem_c;
3690 data_fake.last_closep = data->last_closep;
3691 }
3692 else
3693 data_fake.last_closep = &fake;
3694
3695 if (flags & SCF_DO_STCLASS) {
3696 cl_init(pRExC_state, &this_class);
3697 data_fake.start_class = &this_class;
3698 f = SCF_DO_STCLASS_AND;
3699 }
3700 if (flags & SCF_WHILEM_VISITED_POS)
3701 f |= SCF_WHILEM_VISITED_POS;
3702
3703 if (trie->jump[word]) {
3704 if (!nextbranch)
3705 nextbranch = trie_node + trie->jump[0];
3706 scan= trie_node + trie->jump[word];
3707 /* We go from the jump point to the branch that follows
3708 it. Note this means we need the vestigal unused branches
3709 even though they arent otherwise used.
3710 */
3711 minnext = study_chunk(pRExC_state, &scan, minlenp,
3712 &deltanext, (regnode *)nextbranch, &data_fake,
3713 stopparen, recursed, NULL, f,depth+1);
3714 }
3715 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3716 nextbranch= regnext((regnode*)nextbranch);
3717
3718 if (min1 > (I32)(minnext + trie->minlen))
3719 min1 = minnext + trie->minlen;
3720 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3721 max1 = minnext + deltanext + trie->maxlen;
3722 if (deltanext == I32_MAX)
3723 is_inf = is_inf_internal = 1;
3724
3725 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3726 pars++;
3727 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3728 if ( stopmin > min + min1)
3729 stopmin = min + min1;
3730 flags &= ~SCF_DO_SUBSTR;
3731 if (data)
3732 data->flags |= SCF_SEEN_ACCEPT;
3733 }
3734 if (data) {
3735 if (data_fake.flags & SF_HAS_EVAL)
3736 data->flags |= SF_HAS_EVAL;
3737 data->whilem_c = data_fake.whilem_c;
3738 }
3739 if (flags & SCF_DO_STCLASS)
3740 cl_or(pRExC_state, &accum, &this_class);
3741 }
3742 }
3743 if (flags & SCF_DO_SUBSTR) {
3744 data->pos_min += min1;
3745 data->pos_delta += max1 - min1;
3746 if (max1 != min1 || is_inf)
3747 data->longest = &(data->longest_float);
3748 }
3749 min += min1;
3750 delta += max1 - min1;
3751 if (flags & SCF_DO_STCLASS_OR) {
3752 cl_or(pRExC_state, data->start_class, &accum);
3753 if (min1) {
3754 cl_and(data->start_class, and_withp);
3755 flags &= ~SCF_DO_STCLASS;
3756 }
3757 }
3758 else if (flags & SCF_DO_STCLASS_AND) {
3759 if (min1) {
3760 cl_and(data->start_class, &accum);
3761 flags &= ~SCF_DO_STCLASS;
3762 }
3763 else {
3764 /* Switch to OR mode: cache the old value of
3765 * data->start_class */
3766 INIT_AND_WITHP;
3767 StructCopy(data->start_class, and_withp,
3768 struct regnode_charclass_class);
3769 flags &= ~SCF_DO_STCLASS_AND;
3770 StructCopy(&accum, data->start_class,
3771 struct regnode_charclass_class);
3772 flags |= SCF_DO_STCLASS_OR;
3773 data->start_class->flags |= ANYOF_EOS;
3774 }
3775 }
3776 scan= tail;
3777 continue;
3778 }
786e8c11 3779#else
8aa23a47
YO
3780 else if (PL_regkind[OP(scan)] == TRIE) {
3781 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3782 U8*bang=NULL;
3783
3784 min += trie->minlen;
3785 delta += (trie->maxlen - trie->minlen);
3786 flags &= ~SCF_DO_STCLASS; /* xxx */
3787 if (flags & SCF_DO_SUBSTR) {
3788 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3789 data->pos_min += trie->minlen;
3790 data->pos_delta += (trie->maxlen - trie->minlen);
3791 if (trie->maxlen != trie->minlen)
3792 data->longest = &(data->longest_float);
3793 }
3794 if (trie->jump) /* no more substrings -- for now /grr*/
3795 flags &= ~SCF_DO_SUBSTR;
b515a41d 3796 }
8aa23a47
YO
3797#endif /* old or new */
3798#endif /* TRIE_STUDY_OPT */
3799 /* Else: zero-length, ignore. */
3800 scan = regnext(scan);
3801 }
3802 if (frame) {
3803 last = frame->last;
3804 scan = frame->next;
3805 stopparen = frame->stop;
3806 frame = frame->prev;
3807 goto fake_study_recurse;
c277df42
IZ
3808 }
3809
3810 finish:
8aa23a47
YO
3811 assert(!frame);
3812
c277df42 3813 *scanp = scan;
aca2d497 3814 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3815 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3816 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3817 if (is_par > (I32)U8_MAX)
c277df42
IZ
3818 is_par = 0;
3819 if (is_par && pars==1 && data) {
3820 data->flags |= SF_IN_PAR;
3821 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3822 }
3823 else if (pars && data) {
c277df42
IZ
3824 data->flags |= SF_HAS_PAR;
3825 data->flags &= ~SF_IN_PAR;
3826 }
653099ff 3827 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3828 cl_and(data->start_class, and_withp);
786e8c11
YO
3829 if (flags & SCF_TRIE_RESTUDY)
3830 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3831
3832 DEBUG_STUDYDATA(data,depth);
3833
e2e6a0f1 3834 return min < stopmin ? min : stopmin;
c277df42
IZ
3835}
3836
76e3520e 3837STATIC I32
5f66b61c 3838S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3839{
830247a4 3840 if (RExC_rx->data) {
2eb97020 3841 const U32 count = RExC_rx->data->count;
b81d288d 3842 Renewc(RExC_rx->data,
2eb97020 3843 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3844 char, struct reg_data);
2eb97020 3845 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3846 RExC_rx->data->count += n;
a0ed51b3
LW
3847 }
3848 else {
a02a5408 3849 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3850 char, struct reg_data);
a02a5408 3851 Newx(RExC_rx->data->what, n, U8);
830247a4 3852 RExC_rx->data->count = n;
c277df42 3853 }
830247a4
IZ
3854 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3855 return RExC_rx->data->count - n;
c277df42
IZ
3856}
3857
76234dfb 3858#ifndef PERL_IN_XSUB_RE
d88dccdf 3859void
864dbfa3 3860Perl_reginitcolors(pTHX)
d88dccdf 3861{
97aff369 3862 dVAR;
1df70142 3863 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3864 if (s) {
1df70142
AL
3865 char *t = savepv(s);
3866 int i = 0;
3867 PL_colors[0] = t;
d88dccdf 3868 while (++i < 6) {
1df70142
AL
3869 t = strchr(t, '\t');
3870 if (t) {
3871 *t = '\0';
3872 PL_colors[i] = ++t;
d88dccdf
IZ
3873 }
3874 else
1df70142 3875 PL_colors[i] = t = (char *)"";