This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Module::Pluggable diff had some hidden binary files in it. I'm
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
c74340f9
YO
116 I32 npar; /* Capture buffer count, (OPEN). */
117 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 118 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
119 I32 extralen;
120 I32 seen_zerolen;
121 I32 seen_evals;
40d049e4
YO
122 regnode **open_parens; /* pointers to open parens */
123 regnode **close_parens; /* pointers to close parens */
124 regnode *opend; /* END node in program */
1aa99e6b 125 I32 utf8;
6bda09f9 126 HV *charnames; /* cache of named sequences */
81714fb9 127 HV *paren_names; /* Paren names */
40d049e4
YO
128 regnode **recurse; /* Recurse regops */
129 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
3dab1dad 134#ifdef DEBUGGING
be8e71aa 135 const char *lastparse;
3dab1dad
YO
136 I32 lastnum;
137#define RExC_lastparse (pRExC_state->lastparse)
138#define RExC_lastnum (pRExC_state->lastnum)
139#endif
830247a4
IZ
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
c74340f9 157#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 158#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
159#define RExC_extralen (pRExC_state->extralen)
160#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
161#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 162#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 163#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
164#define RExC_open_parens (pRExC_state->open_parens)
165#define RExC_close_parens (pRExC_state->close_parens)
166#define RExC_opend (pRExC_state->opend)
81714fb9 167#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
168#define RExC_recurse (pRExC_state->recurse)
169#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 170
a687059c
LW
171#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
172#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
173 ((*s) == '{' && regcurly(s)))
a687059c 174
35c8bce7
LW
175#ifdef SPSTART
176#undef SPSTART /* dratted cpp namespace... */
177#endif
a687059c
LW
178/*
179 * Flags to be passed up and down.
180 */
a687059c 181#define WORST 0 /* Worst case. */
821b33a5 182#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
183#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
184#define SPSTART 0x4 /* Starts with * or +. */
185#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 186
3dab1dad
YO
187#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
188
07be1b83
YO
189/* whether trie related optimizations are enabled */
190#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
191#define TRIE_STUDY_OPT
786e8c11 192#define FULL_TRIE_STUDY
07be1b83
YO
193#define TRIE_STCLASS
194#endif
1de06328
YO
195
196
40d049e4
YO
197
198#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
199#define PBITVAL(paren) (1 << ((paren) & 7))
200#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
201#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
202#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
203
204
1de06328
YO
205/* About scan_data_t.
206
207 During optimisation we recurse through the regexp program performing
208 various inplace (keyhole style) optimisations. In addition study_chunk
209 and scan_commit populate this data structure with information about
210 what strings MUST appear in the pattern. We look for the longest
211 string that must appear for at a fixed location, and we look for the
212 longest string that may appear at a floating location. So for instance
213 in the pattern:
214
215 /FOO[xX]A.*B[xX]BAR/
216
217 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
218 strings (because they follow a .* construct). study_chunk will identify
219 both FOO and BAR as being the longest fixed and floating strings respectively.
220
221 The strings can be composites, for instance
222
223 /(f)(o)(o)/
224
225 will result in a composite fixed substring 'foo'.
226
227 For each string some basic information is maintained:
228
229 - offset or min_offset
230 This is the position the string must appear at, or not before.
231 It also implicitly (when combined with minlenp) tells us how many
232 character must match before the string we are searching.
233 Likewise when combined with minlenp and the length of the string
234 tells us how many characters must appear after the string we have
235 found.
236
237 - max_offset
238 Only used for floating strings. This is the rightmost point that
239 the string can appear at. Ifset to I32 max it indicates that the
240 string can occur infinitely far to the right.
241
242 - minlenp
243 A pointer to the minimum length of the pattern that the string
244 was found inside. This is important as in the case of positive
245 lookahead or positive lookbehind we can have multiple patterns
246 involved. Consider
247
248 /(?=FOO).*F/
249
250 The minimum length of the pattern overall is 3, the minimum length
251 of the lookahead part is 3, but the minimum length of the part that
252 will actually match is 1. So 'FOO's minimum length is 3, but the
253 minimum length for the F is 1. This is important as the minimum length
254 is used to determine offsets in front of and behind the string being
255 looked for. Since strings can be composites this is the length of the
256 pattern at the time it was commited with a scan_commit. Note that
257 the length is calculated by study_chunk, so that the minimum lengths
258 are not known until the full pattern has been compiled, thus the
259 pointer to the value.
260
261 - lookbehind
262
263 In the case of lookbehind the string being searched for can be
264 offset past the start point of the final matching string.
265 If this value was just blithely removed from the min_offset it would
266 invalidate some of the calculations for how many chars must match
267 before or after (as they are derived from min_offset and minlen and
268 the length of the string being searched for).
269 When the final pattern is compiled and the data is moved from the
270 scan_data_t structure into the regexp structure the information
271 about lookbehind is factored in, with the information that would
272 have been lost precalculated in the end_shift field for the
273 associated string.
274
275 The fields pos_min and pos_delta are used to store the minimum offset
276 and the delta to the maximum offset at the current point in the pattern.
277
278*/
2c2d71f5
JH
279
280typedef struct scan_data_t {
1de06328
YO
281 /*I32 len_min; unused */
282 /*I32 len_delta; unused */
2c2d71f5
JH
283 I32 pos_min;
284 I32 pos_delta;
285 SV *last_found;
1de06328 286 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
287 I32 last_start_min;
288 I32 last_start_max;
1de06328
YO
289 SV **longest; /* Either &l_fixed, or &l_float. */
290 SV *longest_fixed; /* longest fixed string found in pattern */
291 I32 offset_fixed; /* offset where it starts */
292 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
293 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
294 SV *longest_float; /* longest floating string found in pattern */
295 I32 offset_float_min; /* earliest point in string it can appear */
296 I32 offset_float_max; /* latest point in string it can appear */
297 I32 *minlen_float; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
299 I32 flags;
300 I32 whilem_c;
cb434fcc 301 I32 *last_closep;
653099ff 302 struct regnode_charclass_class *start_class;
2c2d71f5
JH
303} scan_data_t;
304
a687059c 305/*
e50aee73 306 * Forward declarations for pregcomp()'s friends.
a687059c 307 */
a0d0e21e 308
27da23d5 309static const scan_data_t zero_scan_data =
1de06328 310 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
311
312#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
313#define SF_BEFORE_SEOL 0x0001
314#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
315#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
316#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
317
09b7f37c
CB
318#ifdef NO_UNARY_PLUS
319# define SF_FIX_SHIFT_EOL (0+2)
320# define SF_FL_SHIFT_EOL (0+4)
321#else
322# define SF_FIX_SHIFT_EOL (+2)
323# define SF_FL_SHIFT_EOL (+4)
324#endif
c277df42
IZ
325
326#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
327#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
328
329#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
330#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
331#define SF_IS_INF 0x0040
332#define SF_HAS_PAR 0x0080
333#define SF_IN_PAR 0x0100
334#define SF_HAS_EVAL 0x0200
335#define SCF_DO_SUBSTR 0x0400
653099ff
GS
336#define SCF_DO_STCLASS_AND 0x0800
337#define SCF_DO_STCLASS_OR 0x1000
338#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 339#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 340
786e8c11 341#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 342#define SCF_SEEN_ACCEPT 0x8000
07be1b83 343
eb160463 344#define UTF (RExC_utf8 != 0)
bbe252da
YO
345#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
346#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 347
ffc61ed2 348#define OOB_UNICODE 12345678
93733859 349#define OOB_NAMEDCLASS -1
b8c5462f 350
a0ed51b3
LW
351#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
352#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
353
8615cb43 354
b45f050a
JF
355/* length of regex to show in messages that don't mark a position within */
356#define RegexLengthToShowInErrorMessages 127
357
358/*
359 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
360 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
361 * op/pragma/warn/regcomp.
362 */
7253e4e3
RK
363#define MARKER1 "<-- HERE" /* marker as it appears in the description */
364#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 365
7253e4e3 366#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
367
368/*
369 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
370 * arg. Show regex, up to a maximum length. If it's too long, chop and add
371 * "...".
372 */
58e23c8d 373#define _FAIL(code) STMT_START { \
bfed75c6 374 const char *ellipses = ""; \
ccb2c380
MP
375 IV len = RExC_end - RExC_precomp; \
376 \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 if (len > RegexLengthToShowInErrorMessages) { \
380 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
381 len = RegexLengthToShowInErrorMessages - 10; \
382 ellipses = "..."; \
383 } \
58e23c8d 384 code; \
ccb2c380 385} STMT_END
8615cb43 386
58e23c8d
YO
387#define FAIL(msg) _FAIL( \
388 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
389 msg, (int)len, RExC_precomp, ellipses))
390
391#define FAIL2(msg,arg) _FAIL( \
392 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
393 arg, (int)len, RExC_precomp, ellipses))
394
b45f050a 395/*
b45f050a
JF
396 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
397 */
ccb2c380 398#define Simple_vFAIL(m) STMT_START { \
a28509cc 399 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
400 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
401 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
402} STMT_END
b45f050a
JF
403
404/*
405 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
406 */
ccb2c380
MP
407#define vFAIL(m) STMT_START { \
408 if (!SIZE_ONLY) \
409 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
410 Simple_vFAIL(m); \
411} STMT_END
b45f050a
JF
412
413/*
414 * Like Simple_vFAIL(), but accepts two arguments.
415 */
ccb2c380 416#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 417 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
418 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
419 (int)offset, RExC_precomp, RExC_precomp + offset); \
420} STMT_END
b45f050a
JF
421
422/*
423 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
424 */
ccb2c380
MP
425#define vFAIL2(m,a1) STMT_START { \
426 if (!SIZE_ONLY) \
427 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
428 Simple_vFAIL2(m, a1); \
429} STMT_END
b45f050a
JF
430
431
432/*
433 * Like Simple_vFAIL(), but accepts three arguments.
434 */
ccb2c380 435#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 436 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
443 */
ccb2c380
MP
444#define vFAIL3(m,a1,a2) STMT_START { \
445 if (!SIZE_ONLY) \
446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
447 Simple_vFAIL3(m, a1, a2); \
448} STMT_END
b45f050a
JF
449
450/*
451 * Like Simple_vFAIL(), but accepts four arguments.
452 */
ccb2c380 453#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 454 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
455 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
456 (int)offset, RExC_precomp, RExC_precomp + offset); \
457} STMT_END
b45f050a 458
ccb2c380 459#define vWARN(loc,m) STMT_START { \
a28509cc 460 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
461 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
462 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463} STMT_END
464
465#define vWARNdep(loc,m) STMT_START { \
a28509cc 466 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
467 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
468 "%s" REPORT_LOCATION, \
469 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472
473#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 474 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
475 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
476 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
477} STMT_END
478
479#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 480 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
481 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
482 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
483} STMT_END
484
485#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 486 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
487 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
488 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
489} STMT_END
490
491#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 492 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
493 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
494 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
495} STMT_END
9d1d55b5 496
8615cb43 497
cd439c50 498/* Allow for side effects in s */
ccb2c380
MP
499#define REGC(c,s) STMT_START { \
500 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
501} STMT_END
cd439c50 502
fac92740
MJD
503/* Macros for recording node offsets. 20001227 mjd@plover.com
504 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
505 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
506 * Element 0 holds the number n.
07be1b83 507 * Position is 1 indexed.
fac92740
MJD
508 */
509
ccb2c380
MP
510#define Set_Node_Offset_To_R(node,byte) STMT_START { \
511 if (! SIZE_ONLY) { \
512 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 513 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 514 if((node) < 0) { \
551405c4 515 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
516 } else { \
517 RExC_offsets[2*(node)-1] = (byte); \
518 } \
519 } \
520} STMT_END
521
522#define Set_Node_Offset(node,byte) \
523 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
524#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
525
526#define Set_Node_Length_To_R(node,len) STMT_START { \
527 if (! SIZE_ONLY) { \
528 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 529 __LINE__, (int)(node), (int)(len))); \
ccb2c380 530 if((node) < 0) { \
551405c4 531 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
532 } else { \
533 RExC_offsets[2*(node)] = (len); \
534 } \
535 } \
536} STMT_END
537
538#define Set_Node_Length(node,len) \
539 Set_Node_Length_To_R((node)-RExC_emit_start, len)
540#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
541#define Set_Node_Cur_Length(node) \
542 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
543
544/* Get offsets and lengths */
545#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
546#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
547
07be1b83
YO
548#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
549 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
550 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
551} STMT_END
552
553
554#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
555#define EXPERIMENTAL_INPLACESCAN
556#endif
557
1de06328 558#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 559DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 560 PerlIO_printf(Perl_debug_log, \
b515a41d 561 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
562 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
563 (int)(depth)*2, "", \
564 (IV)((data)->pos_min), \
565 (IV)((data)->pos_delta), \
566 (IV)((data)->flags), \
567 (IV)((data)->whilem_c), \
568 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
569 ); \
570 if ((data)->last_found) \
571 PerlIO_printf(Perl_debug_log, \
572 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
573 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
574 SvPVX_const((data)->last_found), \
575 (IV)((data)->last_end), \
576 (IV)((data)->last_start_min), \
577 (IV)((data)->last_start_max), \
578 ((data)->longest && \
579 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
580 SvPVX_const((data)->longest_fixed), \
581 (IV)((data)->offset_fixed), \
582 ((data)->longest && \
583 (data)->longest==&((data)->longest_float)) ? "*" : "", \
584 SvPVX_const((data)->longest_float), \
585 (IV)((data)->offset_float_min), \
586 (IV)((data)->offset_float_max) \
587 ); \
588 PerlIO_printf(Perl_debug_log,"\n"); \
589});
590
acfe0abc 591static void clear_re(pTHX_ void *r);
4327152a 592
653099ff 593/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 594 Update the longest found anchored substring and the longest found
653099ff
GS
595 floating substrings if needed. */
596
4327152a 597STATIC void
1de06328 598S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 599{
e1ec3a88
AL
600 const STRLEN l = CHR_SVLEN(data->last_found);
601 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 602 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 603
c277df42 604 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 605 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
606 if (*data->longest == data->longest_fixed) {
607 data->offset_fixed = l ? data->last_start_min : data->pos_min;
608 if (data->flags & SF_BEFORE_EOL)
b81d288d 609 data->flags
c277df42
IZ
610 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
611 else
612 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
613 data->minlen_fixed=minlenp;
614 data->lookbehind_fixed=0;
a0ed51b3
LW
615 }
616 else {
c277df42 617 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
618 data->offset_float_max = (l
619 ? data->last_start_max
c277df42 620 : data->pos_min + data->pos_delta);
9051bda5
HS
621 if ((U32)data->offset_float_max > (U32)I32_MAX)
622 data->offset_float_max = I32_MAX;
c277df42 623 if (data->flags & SF_BEFORE_EOL)
b81d288d 624 data->flags
c277df42
IZ
625 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
626 else
627 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
628 data->minlen_float=minlenp;
629 data->lookbehind_float=0;
c277df42
IZ
630 }
631 }
632 SvCUR_set(data->last_found, 0);
0eda9292 633 {
a28509cc 634 SV * const sv = data->last_found;
097eb12c
AL
635 if (SvUTF8(sv) && SvMAGICAL(sv)) {
636 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
637 if (mg)
638 mg->mg_len = 0;
639 }
0eda9292 640 }
c277df42
IZ
641 data->last_end = -1;
642 data->flags &= ~SF_BEFORE_EOL;
1de06328 643 DEBUG_STUDYDATA(data,0);
c277df42
IZ
644}
645
653099ff
GS
646/* Can match anything (initialization) */
647STATIC void
097eb12c 648S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 649{
653099ff 650 ANYOF_CLASS_ZERO(cl);
f8bef550 651 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 652 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
653 if (LOC)
654 cl->flags |= ANYOF_LOCALE;
655}
656
657/* Can match anything (initialization) */
658STATIC int
5f66b61c 659S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
660{
661 int value;
662
aaa51d5e 663 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
664 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
665 return 1;
1aa99e6b
IH
666 if (!(cl->flags & ANYOF_UNICODE_ALL))
667 return 0;
10edeb5d 668 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 669 return 0;
653099ff
GS
670 return 1;
671}
672
673/* Can match anything (initialization) */
674STATIC void
097eb12c 675S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 676{
8ecf7187 677 Zero(cl, 1, struct regnode_charclass_class);
653099ff 678 cl->type = ANYOF;
830247a4 679 cl_anything(pRExC_state, cl);
653099ff
GS
680}
681
682STATIC void
097eb12c 683S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 684{
8ecf7187 685 Zero(cl, 1, struct regnode_charclass_class);
653099ff 686 cl->type = ANYOF;
830247a4 687 cl_anything(pRExC_state, cl);
653099ff
GS
688 if (LOC)
689 cl->flags |= ANYOF_LOCALE;
690}
691
692/* 'And' a given class with another one. Can create false positives */
693/* We assume that cl is not inverted */
694STATIC void
5f66b61c 695S_cl_and(struct regnode_charclass_class *cl,
a28509cc 696 const struct regnode_charclass_class *and_with)
653099ff 697{
40d049e4
YO
698
699 assert(and_with->type == ANYOF);
653099ff
GS
700 if (!(and_with->flags & ANYOF_CLASS)
701 && !(cl->flags & ANYOF_CLASS)
702 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
703 && !(and_with->flags & ANYOF_FOLD)
704 && !(cl->flags & ANYOF_FOLD)) {
705 int i;
706
707 if (and_with->flags & ANYOF_INVERT)
708 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
709 cl->bitmap[i] &= ~and_with->bitmap[i];
710 else
711 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
712 cl->bitmap[i] &= and_with->bitmap[i];
713 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
714 if (!(and_with->flags & ANYOF_EOS))
715 cl->flags &= ~ANYOF_EOS;
1aa99e6b 716
14ebb1a2
JH
717 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
718 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
719 cl->flags &= ~ANYOF_UNICODE_ALL;
720 cl->flags |= ANYOF_UNICODE;
721 ARG_SET(cl, ARG(and_with));
722 }
14ebb1a2
JH
723 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
724 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 725 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
726 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
727 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 728 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
729}
730
731/* 'OR' a given class with another one. Can create false positives */
732/* We assume that cl is not inverted */
733STATIC void
097eb12c 734S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 735{
653099ff
GS
736 if (or_with->flags & ANYOF_INVERT) {
737 /* We do not use
738 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
739 * <= (B1 | !B2) | (CL1 | !CL2)
740 * which is wasteful if CL2 is small, but we ignore CL2:
741 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
742 * XXXX Can we handle case-fold? Unclear:
743 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
744 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
745 */
746 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
747 && !(or_with->flags & ANYOF_FOLD)
748 && !(cl->flags & ANYOF_FOLD) ) {
749 int i;
750
751 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
752 cl->bitmap[i] |= ~or_with->bitmap[i];
753 } /* XXXX: logic is complicated otherwise */
754 else {
830247a4 755 cl_anything(pRExC_state, cl);
653099ff
GS
756 }
757 } else {
758 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
759 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 760 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
761 || (cl->flags & ANYOF_FOLD)) ) {
762 int i;
763
764 /* OR char bitmap and class bitmap separately */
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] |= or_with->bitmap[i];
767 if (or_with->flags & ANYOF_CLASS) {
768 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
769 cl->classflags[i] |= or_with->classflags[i];
770 cl->flags |= ANYOF_CLASS;
771 }
772 }
773 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 774 cl_anything(pRExC_state, cl);
653099ff
GS
775 }
776 }
777 if (or_with->flags & ANYOF_EOS)
778 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
779
780 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
781 ARG(cl) != ARG(or_with)) {
782 cl->flags |= ANYOF_UNICODE_ALL;
783 cl->flags &= ~ANYOF_UNICODE;
784 }
785 if (or_with->flags & ANYOF_UNICODE_ALL) {
786 cl->flags |= ANYOF_UNICODE_ALL;
787 cl->flags &= ~ANYOF_UNICODE;
788 }
653099ff
GS
789}
790
a3621e74
YO
791#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
792#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
793#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
794#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
795
3dab1dad
YO
796
797#ifdef DEBUGGING
07be1b83 798/*
3dab1dad
YO
799 dump_trie(trie)
800 dump_trie_interim_list(trie,next_alloc)
801 dump_trie_interim_table(trie,next_alloc)
802
803 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
804 The _interim_ variants are used for debugging the interim
805 tables that are used to generate the final compressed
806 representation which is what dump_trie expects.
807
3dab1dad
YO
808 Part of the reason for their existance is to provide a form
809 of documentation as to how the different representations function.
07be1b83
YO
810
811*/
3dab1dad
YO
812
813/*
814 dump_trie(trie)
815 Dumps the final compressed table form of the trie to Perl_debug_log.
816 Used for debugging make_trie().
817*/
818
819STATIC void
820S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
821{
822 U32 state;
ab3bbdeb
YO
823 SV *sv=sv_newmortal();
824 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
825 GET_RE_DEBUG_FLAGS_DECL;
826
ab3bbdeb 827
3dab1dad
YO
828 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
829 (int)depth * 2 + 2,"",
830 "Match","Base","Ofs" );
831
832 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 833 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 834 if ( tmp ) {
ab3bbdeb
YO
835 PerlIO_printf( Perl_debug_log, "%*s",
836 colwidth,
ddc5bc0f 837 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
838 PL_colors[0], PL_colors[1],
839 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
840 PERL_PV_ESCAPE_FIRSTCHAR
841 )
842 );
3dab1dad
YO
843 }
844 }
845 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
846 (int)depth * 2 + 2,"");
847
848 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 849 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
850 PerlIO_printf( Perl_debug_log, "\n");
851
1e2e3d02 852 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 853 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
854
855 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
856
857 if ( trie->states[ state ].wordnum ) {
858 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
859 } else {
860 PerlIO_printf( Perl_debug_log, "%6s", "" );
861 }
862
863 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
864
865 if ( base ) {
866 U32 ofs = 0;
867
868 while( ( base + ofs < trie->uniquecharcount ) ||
869 ( base + ofs - trie->uniquecharcount < trie->lasttrans
870 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
871 ofs++;
872
873 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
874
875 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
876 if ( ( base + ofs >= trie->uniquecharcount ) &&
877 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
878 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
879 {
ab3bbdeb
YO
880 PerlIO_printf( Perl_debug_log, "%*"UVXf,
881 colwidth,
3dab1dad
YO
882 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
883 } else {
ab3bbdeb 884 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
885 }
886 }
887
888 PerlIO_printf( Perl_debug_log, "]");
889
890 }
891 PerlIO_printf( Perl_debug_log, "\n" );
892 }
893}
894/*
895 dump_trie_interim_list(trie,next_alloc)
896 Dumps a fully constructed but uncompressed trie in list form.
897 List tries normally only are used for construction when the number of
898 possible chars (trie->uniquecharcount) is very high.
899 Used for debugging make_trie().
900*/
901STATIC void
902S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
903{
904 U32 state;
ab3bbdeb
YO
905 SV *sv=sv_newmortal();
906 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
907 GET_RE_DEBUG_FLAGS_DECL;
908 /* print out the table precompression. */
ab3bbdeb
YO
909 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
910 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
911 "------:-----+-----------------\n" );
3dab1dad
YO
912
913 for( state=1 ; state < next_alloc ; state ++ ) {
914 U16 charid;
915
ab3bbdeb 916 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
917 (int)depth * 2 + 2,"", (UV)state );
918 if ( ! trie->states[ state ].wordnum ) {
919 PerlIO_printf( Perl_debug_log, "%5s| ","");
920 } else {
921 PerlIO_printf( Perl_debug_log, "W%4x| ",
922 trie->states[ state ].wordnum
923 );
924 }
925 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 926 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
927 if ( tmp ) {
928 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
929 colwidth,
ddc5bc0f 930 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
931 PL_colors[0], PL_colors[1],
932 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
933 PERL_PV_ESCAPE_FIRSTCHAR
934 ) ,
1e2e3d02
YO
935 TRIE_LIST_ITEM(state,charid).forid,
936 (UV)TRIE_LIST_ITEM(state,charid).newstate
937 );
938 if (!(charid % 10))
664e119d
RGS
939 PerlIO_printf(Perl_debug_log, "\n%*s| ",
940 (int)((depth * 2) + 14), "");
1e2e3d02 941 }
ab3bbdeb
YO
942 }
943 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
944 }
945}
946
947/*
948 dump_trie_interim_table(trie,next_alloc)
949 Dumps a fully constructed but uncompressed trie in table form.
950 This is the normal DFA style state transition table, with a few
951 twists to facilitate compression later.
952 Used for debugging make_trie().
953*/
954STATIC void
955S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
956{
957 U32 state;
958 U16 charid;
ab3bbdeb
YO
959 SV *sv=sv_newmortal();
960 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
961 GET_RE_DEBUG_FLAGS_DECL;
962
963 /*
964 print out the table precompression so that we can do a visual check
965 that they are identical.
966 */
967
968 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
969
970 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 971 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 972 if ( tmp ) {
ab3bbdeb
YO
973 PerlIO_printf( Perl_debug_log, "%*s",
974 colwidth,
ddc5bc0f 975 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
976 PL_colors[0], PL_colors[1],
977 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
978 PERL_PV_ESCAPE_FIRSTCHAR
979 )
980 );
3dab1dad
YO
981 }
982 }
983
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
985
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 987 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
988 }
989
990 PerlIO_printf( Perl_debug_log, "\n" );
991
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
993
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
997
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
999 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1000 if (v)
1001 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1002 else
1003 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1004 }
1005 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1006 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1007 } else {
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1009 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1010 }
1011 }
07be1b83 1012}
3dab1dad
YO
1013
1014#endif
1015
786e8c11
YO
1016/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1017 startbranch: the first branch in the whole branch sequence
1018 first : start branch of sequence of branch-exact nodes.
1019 May be the same as startbranch
1020 last : Thing following the last branch.
1021 May be the same as tail.
1022 tail : item following the branch sequence
1023 count : words in the sequence
1024 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1025 depth : indent depth
3dab1dad 1026
786e8c11 1027Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1028
786e8c11
YO
1029A trie is an N'ary tree where the branches are determined by digital
1030decomposition of the key. IE, at the root node you look up the 1st character and
1031follow that branch repeat until you find the end of the branches. Nodes can be
1032marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1033
786e8c11 1034 /he|she|his|hers/
72f13be8 1035
786e8c11
YO
1036would convert into the following structure. Numbers represent states, letters
1037following numbers represent valid transitions on the letter from that state, if
1038the number is in square brackets it represents an accepting state, otherwise it
1039will be in parenthesis.
07be1b83 1040
786e8c11
YO
1041 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1042 | |
1043 | (2)
1044 | |
1045 (1) +-i->(6)-+-s->[7]
1046 |
1047 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1048
786e8c11
YO
1049 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1050
1051This shows that when matching against the string 'hers' we will begin at state 1
1052read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1053then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1054is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1055single traverse. We store a mapping from accepting to state to which word was
1056matched, and then when we have multiple possibilities we try to complete the
1057rest of the regex in the order in which they occured in the alternation.
1058
1059The only prior NFA like behaviour that would be changed by the TRIE support is
1060the silent ignoring of duplicate alternations which are of the form:
1061
1062 / (DUPE|DUPE) X? (?{ ... }) Y /x
1063
1064Thus EVAL blocks follwing a trie may be called a different number of times with
1065and without the optimisation. With the optimisations dupes will be silently
1066ignored. This inconsistant behaviour of EVAL type nodes is well established as
1067the following demonstrates:
1068
1069 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1070
1071which prints out 'word' three times, but
1072
1073 'words'=~/(word|word|word)(?{ print $1 })S/
1074
1075which doesnt print it out at all. This is due to other optimisations kicking in.
1076
1077Example of what happens on a structural level:
1078
1079The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1080
1081 1: CURLYM[1] {1,32767}(18)
1082 5: BRANCH(8)
1083 6: EXACT <ac>(16)
1084 8: BRANCH(11)
1085 9: EXACT <ad>(16)
1086 11: BRANCH(14)
1087 12: EXACT <ab>(16)
1088 16: SUCCEED(0)
1089 17: NOTHING(18)
1090 18: END(0)
1091
1092This would be optimizable with startbranch=5, first=5, last=16, tail=16
1093and should turn into:
1094
1095 1: CURLYM[1] {1,32767}(18)
1096 5: TRIE(16)
1097 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1098 <ac>
1099 <ad>
1100 <ab>
1101 16: SUCCEED(0)
1102 17: NOTHING(18)
1103 18: END(0)
1104
1105Cases where tail != last would be like /(?foo|bar)baz/:
1106
1107 1: BRANCH(4)
1108 2: EXACT <foo>(8)
1109 4: BRANCH(7)
1110 5: EXACT <bar>(8)
1111 7: TAIL(8)
1112 8: EXACT <baz>(10)
1113 10: END(0)
1114
1115which would be optimizable with startbranch=1, first=1, last=7, tail=8
1116and would end up looking like:
1117
1118 1: TRIE(8)
1119 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1120 <foo>
1121 <bar>
1122 7: TAIL(8)
1123 8: EXACT <baz>(10)
1124 10: END(0)
1125
1126 d = uvuni_to_utf8_flags(d, uv, 0);
1127
1128is the recommended Unicode-aware way of saying
1129
1130 *(d++) = uv;
1131*/
1132
1e2e3d02 1133#define TRIE_STORE_REVCHAR \
786e8c11 1134 STMT_START { \
1e2e3d02 1135 SV *tmp = newSVpvs(""); \
786e8c11 1136 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1137 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1138 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1139 } STMT_END
1140
1141#define TRIE_READ_CHAR STMT_START { \
1142 wordlen++; \
1143 if ( UTF ) { \
1144 if ( folder ) { \
1145 if ( foldlen > 0 ) { \
1146 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1147 foldlen -= len; \
1148 scan += len; \
1149 len = 0; \
1150 } else { \
1151 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1152 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1153 foldlen -= UNISKIP( uvc ); \
1154 scan = foldbuf + UNISKIP( uvc ); \
1155 } \
1156 } else { \
1157 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1158 } \
1159 } else { \
1160 uvc = (U32)*uc; \
1161 len = 1; \
1162 } \
1163} STMT_END
1164
1165
1166
1167#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1168 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1169 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1170 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1171 } \
1172 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1173 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1174 TRIE_LIST_CUR( state )++; \
1175} STMT_END
07be1b83 1176
786e8c11
YO
1177#define TRIE_LIST_NEW(state) STMT_START { \
1178 Newxz( trie->states[ state ].trans.list, \
1179 4, reg_trie_trans_le ); \
1180 TRIE_LIST_CUR( state ) = 1; \
1181 TRIE_LIST_LEN( state ) = 4; \
1182} STMT_END
07be1b83 1183
786e8c11
YO
1184#define TRIE_HANDLE_WORD(state) STMT_START { \
1185 U16 dupe= trie->states[ state ].wordnum; \
1186 regnode * const noper_next = regnext( noper ); \
1187 \
1188 if (trie->wordlen) \
1189 trie->wordlen[ curword ] = wordlen; \
1190 DEBUG_r({ \
1191 /* store the word for dumping */ \
1192 SV* tmp; \
1193 if (OP(noper) != NOTHING) \
1194 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1195 else \
1196 tmp = newSVpvn( "", 0 ); \
1197 if ( UTF ) SvUTF8_on( tmp ); \
1198 av_push( trie->words, tmp ); \
1199 }); \
1200 \
1201 curword++; \
1202 \
1203 if ( noper_next < tail ) { \
1204 if (!trie->jump) \
1205 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1206 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1207 if (!jumper) \
1208 jumper = noper_next; \
1209 if (!nextbranch) \
1210 nextbranch= regnext(cur); \
1211 } \
1212 \
1213 if ( dupe ) { \
1214 /* So it's a dupe. This means we need to maintain a */\
1215 /* linked-list from the first to the next. */\
1216 /* we only allocate the nextword buffer when there */\
1217 /* a dupe, so first time we have to do the allocation */\
1218 if (!trie->nextword) \
1219 Newxz( trie->nextword, word_count + 1, U16); \
1220 while ( trie->nextword[dupe] ) \
1221 dupe= trie->nextword[dupe]; \
1222 trie->nextword[dupe]= curword; \
1223 } else { \
1224 /* we haven't inserted this word yet. */ \
1225 trie->states[ state ].wordnum = curword; \
1226 } \
1227} STMT_END
07be1b83 1228
3dab1dad 1229
786e8c11
YO
1230#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1231 ( ( base + charid >= ucharcount \
1232 && base + charid < ubound \
1233 && state == trie->trans[ base - ucharcount + charid ].check \
1234 && trie->trans[ base - ucharcount + charid ].next ) \
1235 ? trie->trans[ base - ucharcount + charid ].next \
1236 : ( state==1 ? special : 0 ) \
1237 )
3dab1dad 1238
786e8c11
YO
1239#define MADE_TRIE 1
1240#define MADE_JUMP_TRIE 2
1241#define MADE_EXACT_TRIE 4
3dab1dad 1242
a3621e74 1243STATIC I32
786e8c11 1244S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1245{
27da23d5 1246 dVAR;
a3621e74
YO
1247 /* first pass, loop through and scan words */
1248 reg_trie_data *trie;
1249 regnode *cur;
9f7f3913 1250 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1251 STRLEN len = 0;
1252 UV uvc = 0;
1253 U16 curword = 0;
1254 U32 next_alloc = 0;
786e8c11
YO
1255 regnode *jumper = NULL;
1256 regnode *nextbranch = NULL;
7f69552c 1257 regnode *convert = NULL;
a3621e74 1258 /* we just use folder as a flag in utf8 */
e1ec3a88 1259 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1260 ? PL_fold
1261 : ( flags == EXACTFL
1262 ? PL_fold_locale
1263 : NULL
1264 )
1265 );
1266
e1ec3a88 1267 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1268 SV *re_trie_maxbuff;
3dab1dad
YO
1269#ifndef DEBUGGING
1270 /* these are only used during construction but are useful during
8e11feef 1271 * debugging so we store them in the struct when debugging.
8e11feef 1272 */
3dab1dad 1273 STRLEN trie_charcount=0;
3dab1dad
YO
1274 AV *trie_revcharmap;
1275#endif
a3621e74 1276 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1277#ifndef DEBUGGING
1278 PERL_UNUSED_ARG(depth);
1279#endif
a3621e74 1280
a02a5408 1281 Newxz( trie, 1, reg_trie_data );
a3621e74 1282 trie->refcount = 1;
3dab1dad 1283 trie->startstate = 1;
786e8c11 1284 trie->wordcount = word_count;
a3621e74 1285 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1286 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1287 if (!(UTF && folder))
1288 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1289 DEBUG_r({
1290 trie->words = newAV();
a3621e74 1291 });
3dab1dad 1292 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1293
0111c4fd 1294 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1295 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1296 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1297 }
3dab1dad
YO
1298 DEBUG_OPTIMISE_r({
1299 PerlIO_printf( Perl_debug_log,
786e8c11 1300 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1301 (int)depth * 2 + 2, "",
1302 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1303 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1304 (int)depth);
3dab1dad 1305 });
7f69552c
YO
1306
1307 /* Find the node we are going to overwrite */
1308 if ( first == startbranch && OP( last ) != BRANCH ) {
1309 /* whole branch chain */
1310 convert = first;
1311 } else {
1312 /* branch sub-chain */
1313 convert = NEXTOPER( first );
1314 }
1315
a3621e74
YO
1316 /* -- First loop and Setup --
1317
1318 We first traverse the branches and scan each word to determine if it
1319 contains widechars, and how many unique chars there are, this is
1320 important as we have to build a table with at least as many columns as we
1321 have unique chars.
1322
1323 We use an array of integers to represent the character codes 0..255
1324 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1325 native representation of the character value as the key and IV's for the
1326 coded index.
1327
1328 *TODO* If we keep track of how many times each character is used we can
1329 remap the columns so that the table compression later on is more
1330 efficient in terms of memory by ensuring most common value is in the
1331 middle and the least common are on the outside. IMO this would be better
1332 than a most to least common mapping as theres a decent chance the most
1333 common letter will share a node with the least common, meaning the node
1334 will not be compressable. With a middle is most common approach the worst
1335 case is when we have the least common nodes twice.
1336
1337 */
1338
a3621e74 1339 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1340 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1341 const U8 *uc = (U8*)STRING( noper );
a28509cc 1342 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1343 STRLEN foldlen = 0;
1344 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1345 const U8 *scan = (U8*)NULL;
07be1b83 1346 U32 wordlen = 0; /* required init */
3dab1dad 1347 STRLEN chars=0;
a3621e74 1348
3dab1dad
YO
1349 if (OP(noper) == NOTHING) {
1350 trie->minlen= 0;
1351 continue;
1352 }
1353 if (trie->bitmap) {
1354 TRIE_BITMAP_SET(trie,*uc);
1355 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1356 }
a3621e74 1357 for ( ; uc < e ; uc += len ) {
3dab1dad 1358 TRIE_CHARCOUNT(trie)++;
a3621e74 1359 TRIE_READ_CHAR;
3dab1dad 1360 chars++;
a3621e74
YO
1361 if ( uvc < 256 ) {
1362 if ( !trie->charmap[ uvc ] ) {
1363 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1364 if ( folder )
1365 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1366 TRIE_STORE_REVCHAR;
a3621e74
YO
1367 }
1368 } else {
1369 SV** svpp;
1370 if ( !trie->widecharmap )
1371 trie->widecharmap = newHV();
1372
1373 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1374
1375 if ( !svpp )
e4584336 1376 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1377
1378 if ( !SvTRUE( *svpp ) ) {
1379 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1380 TRIE_STORE_REVCHAR;
a3621e74
YO
1381 }
1382 }
1383 }
3dab1dad
YO
1384 if( cur == first ) {
1385 trie->minlen=chars;
1386 trie->maxlen=chars;
1387 } else if (chars < trie->minlen) {
1388 trie->minlen=chars;
1389 } else if (chars > trie->maxlen) {
1390 trie->maxlen=chars;
1391 }
1392
a3621e74
YO
1393 } /* end first pass */
1394 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1395 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1396 (int)depth * 2 + 2,"",
85c3142d 1397 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1398 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1399 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1400 );
786e8c11 1401 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1402
1403 /*
1404 We now know what we are dealing with in terms of unique chars and
1405 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1406 representation using a flat table will take. If it's over a reasonable
1407 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1408 conservative but potentially much slower representation using an array
1409 of lists.
1410
1411 At the end we convert both representations into the same compressed
1412 form that will be used in regexec.c for matching with. The latter
1413 is a form that cannot be used to construct with but has memory
1414 properties similar to the list form and access properties similar
1415 to the table form making it both suitable for fast searches and
1416 small enough that its feasable to store for the duration of a program.
1417
1418 See the comment in the code where the compressed table is produced
1419 inplace from the flat tabe representation for an explanation of how
1420 the compression works.
1421
1422 */
1423
1424
3dab1dad 1425 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1426 /*
1427 Second Pass -- Array Of Lists Representation
1428
1429 Each state will be represented by a list of charid:state records
1430 (reg_trie_trans_le) the first such element holds the CUR and LEN
1431 points of the allocated array. (See defines above).
1432
1433 We build the initial structure using the lists, and then convert
1434 it into the compressed table form which allows faster lookups
1435 (but cant be modified once converted).
a3621e74
YO
1436 */
1437
a3621e74
YO
1438 STRLEN transcount = 1;
1439
1e2e3d02
YO
1440 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1441 "%*sCompiling trie using list compiler\n",
1442 (int)depth * 2 + 2, ""));
1443
3dab1dad 1444 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1445 TRIE_LIST_NEW(1);
1446 next_alloc = 2;
1447
1448 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1449
c445ea15
AL
1450 regnode * const noper = NEXTOPER( cur );
1451 U8 *uc = (U8*)STRING( noper );
1452 const U8 * const e = uc + STR_LEN( noper );
1453 U32 state = 1; /* required init */
1454 U16 charid = 0; /* sanity init */
1455 U8 *scan = (U8*)NULL; /* sanity init */
1456 STRLEN foldlen = 0; /* required init */
07be1b83 1457 U32 wordlen = 0; /* required init */
c445ea15
AL
1458 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1459
3dab1dad 1460 if (OP(noper) != NOTHING) {
786e8c11 1461 for ( ; uc < e ; uc += len ) {
c445ea15 1462
786e8c11 1463 TRIE_READ_CHAR;
c445ea15 1464
786e8c11
YO
1465 if ( uvc < 256 ) {
1466 charid = trie->charmap[ uvc ];
c445ea15 1467 } else {
786e8c11
YO
1468 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1469 if ( !svpp ) {
1470 charid = 0;
1471 } else {
1472 charid=(U16)SvIV( *svpp );
1473 }
c445ea15 1474 }
786e8c11
YO
1475 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1476 if ( charid ) {
a3621e74 1477
786e8c11
YO
1478 U16 check;
1479 U32 newstate = 0;
a3621e74 1480
786e8c11
YO
1481 charid--;
1482 if ( !trie->states[ state ].trans.list ) {
1483 TRIE_LIST_NEW( state );
c445ea15 1484 }
786e8c11
YO
1485 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1486 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1487 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1488 break;
1489 }
1490 }
1491 if ( ! newstate ) {
1492 newstate = next_alloc++;
1493 TRIE_LIST_PUSH( state, charid, newstate );
1494 transcount++;
1495 }
1496 state = newstate;
1497 } else {
1498 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1499 }
a28509cc 1500 }
c445ea15 1501 }
3dab1dad 1502 TRIE_HANDLE_WORD(state);
a3621e74
YO
1503
1504 } /* end second pass */
1505
1e2e3d02
YO
1506 /* next alloc is the NEXT state to be allocated */
1507 trie->statecount = next_alloc;
a3621e74
YO
1508 Renew( trie->states, next_alloc, reg_trie_state );
1509
3dab1dad
YO
1510 /* and now dump it out before we compress it */
1511 DEBUG_TRIE_COMPILE_MORE_r(
1512 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1513 );
a3621e74 1514
a02a5408 1515 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1516 {
1517 U32 state;
a3621e74
YO
1518 U32 tp = 0;
1519 U32 zp = 0;
1520
1521
1522 for( state=1 ; state < next_alloc ; state ++ ) {
1523 U32 base=0;
1524
1525 /*
1526 DEBUG_TRIE_COMPILE_MORE_r(
1527 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1528 );
1529 */
1530
1531 if (trie->states[state].trans.list) {
1532 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1533 U16 maxid=minid;
a28509cc 1534 U16 idx;
a3621e74
YO
1535
1536 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1537 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1538 if ( forid < minid ) {
1539 minid=forid;
1540 } else if ( forid > maxid ) {
1541 maxid=forid;
1542 }
a3621e74
YO
1543 }
1544 if ( transcount < tp + maxid - minid + 1) {
1545 transcount *= 2;
1546 Renew( trie->trans, transcount, reg_trie_trans );
1547 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1548 }
1549 base = trie->uniquecharcount + tp - minid;
1550 if ( maxid == minid ) {
1551 U32 set = 0;
1552 for ( ; zp < tp ; zp++ ) {
1553 if ( ! trie->trans[ zp ].next ) {
1554 base = trie->uniquecharcount + zp - minid;
1555 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556 trie->trans[ zp ].check = state;
1557 set = 1;
1558 break;
1559 }
1560 }
1561 if ( !set ) {
1562 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1563 trie->trans[ tp ].check = state;
1564 tp++;
1565 zp = tp;
1566 }
1567 } else {
1568 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1569 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1570 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1571 trie->trans[ tid ].check = state;
1572 }
1573 tp += ( maxid - minid + 1 );
1574 }
1575 Safefree(trie->states[ state ].trans.list);
1576 }
1577 /*
1578 DEBUG_TRIE_COMPILE_MORE_r(
1579 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1580 );
1581 */
1582 trie->states[ state ].trans.base=base;
1583 }
cc601c31 1584 trie->lasttrans = tp + 1;
a3621e74
YO
1585 }
1586 } else {
1587 /*
1588 Second Pass -- Flat Table Representation.
1589
1590 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1591 We know that we will need Charcount+1 trans at most to store the data
1592 (one row per char at worst case) So we preallocate both structures
1593 assuming worst case.
1594
1595 We then construct the trie using only the .next slots of the entry
1596 structs.
1597
1598 We use the .check field of the first entry of the node temporarily to
1599 make compression both faster and easier by keeping track of how many non
1600 zero fields are in the node.
1601
1602 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1603 transition.
1604
1605 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1606 number representing the first entry of the node, and state as a
1607 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1608 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1609 are 2 entrys per node. eg:
1610
1611 A B A B
1612 1. 2 4 1. 3 7
1613 2. 0 3 3. 0 5
1614 3. 0 0 5. 0 0
1615 4. 0 0 7. 0 0
1616
1617 The table is internally in the right hand, idx form. However as we also
1618 have to deal with the states array which is indexed by nodenum we have to
1619 use TRIE_NODENUM() to convert.
1620
1621 */
1e2e3d02
YO
1622 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1623 "%*sCompiling trie using table compiler\n",
1624 (int)depth * 2 + 2, ""));
3dab1dad
YO
1625
1626 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1627 reg_trie_trans );
3dab1dad 1628 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1629 next_alloc = trie->uniquecharcount + 1;
1630
3dab1dad 1631
a3621e74
YO
1632 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1633
c445ea15 1634 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1635 const U8 *uc = (U8*)STRING( noper );
1636 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1637
1638 U32 state = 1; /* required init */
1639
1640 U16 charid = 0; /* sanity init */
1641 U32 accept_state = 0; /* sanity init */
1642 U8 *scan = (U8*)NULL; /* sanity init */
1643
1644 STRLEN foldlen = 0; /* required init */
07be1b83 1645 U32 wordlen = 0; /* required init */
a3621e74
YO
1646 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1647
3dab1dad 1648 if ( OP(noper) != NOTHING ) {
786e8c11 1649 for ( ; uc < e ; uc += len ) {
a3621e74 1650
786e8c11 1651 TRIE_READ_CHAR;
a3621e74 1652
786e8c11
YO
1653 if ( uvc < 256 ) {
1654 charid = trie->charmap[ uvc ];
1655 } else {
1656 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1657 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1658 }
786e8c11
YO
1659 if ( charid ) {
1660 charid--;
1661 if ( !trie->trans[ state + charid ].next ) {
1662 trie->trans[ state + charid ].next = next_alloc;
1663 trie->trans[ state ].check++;
1664 next_alloc += trie->uniquecharcount;
1665 }
1666 state = trie->trans[ state + charid ].next;
1667 } else {
1668 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1669 }
1670 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1671 }
a3621e74 1672 }
3dab1dad
YO
1673 accept_state = TRIE_NODENUM( state );
1674 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1675
1676 } /* end second pass */
1677
3dab1dad
YO
1678 /* and now dump it out before we compress it */
1679 DEBUG_TRIE_COMPILE_MORE_r(
1680 dump_trie_interim_table(trie,next_alloc,depth+1)
1681 );
a3621e74 1682
a3621e74
YO
1683 {
1684 /*
1685 * Inplace compress the table.*
1686
1687 For sparse data sets the table constructed by the trie algorithm will
1688 be mostly 0/FAIL transitions or to put it another way mostly empty.
1689 (Note that leaf nodes will not contain any transitions.)
1690
1691 This algorithm compresses the tables by eliminating most such
1692 transitions, at the cost of a modest bit of extra work during lookup:
1693
1694 - Each states[] entry contains a .base field which indicates the
1695 index in the state[] array wheres its transition data is stored.
1696
1697 - If .base is 0 there are no valid transitions from that node.
1698
1699 - If .base is nonzero then charid is added to it to find an entry in
1700 the trans array.
1701
1702 -If trans[states[state].base+charid].check!=state then the
1703 transition is taken to be a 0/Fail transition. Thus if there are fail
1704 transitions at the front of the node then the .base offset will point
1705 somewhere inside the previous nodes data (or maybe even into a node
1706 even earlier), but the .check field determines if the transition is
1707 valid.
1708
786e8c11 1709 XXX - wrong maybe?
a3621e74
YO
1710 The following process inplace converts the table to the compressed
1711 table: We first do not compress the root node 1,and mark its all its
1712 .check pointers as 1 and set its .base pointer as 1 as well. This
1713 allows to do a DFA construction from the compressed table later, and
1714 ensures that any .base pointers we calculate later are greater than
1715 0.
1716
1717 - We set 'pos' to indicate the first entry of the second node.
1718
1719 - We then iterate over the columns of the node, finding the first and
1720 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1721 and set the .check pointers accordingly, and advance pos
1722 appropriately and repreat for the next node. Note that when we copy
1723 the next pointers we have to convert them from the original
1724 NODEIDX form to NODENUM form as the former is not valid post
1725 compression.
1726
1727 - If a node has no transitions used we mark its base as 0 and do not
1728 advance the pos pointer.
1729
1730 - If a node only has one transition we use a second pointer into the
1731 structure to fill in allocated fail transitions from other states.
1732 This pointer is independent of the main pointer and scans forward
1733 looking for null transitions that are allocated to a state. When it
1734 finds one it writes the single transition into the "hole". If the
786e8c11 1735 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1736
1737 - Once compressed we can Renew/realloc the structures to release the
1738 excess space.
1739
1740 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1741 specifically Fig 3.47 and the associated pseudocode.
1742
1743 demq
1744 */
a3b680e6 1745 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1746 U32 state, charid;
a3621e74 1747 U32 pos = 0, zp=0;
1e2e3d02 1748 trie->statecount = laststate;
a3621e74
YO
1749
1750 for ( state = 1 ; state < laststate ; state++ ) {
1751 U8 flag = 0;
a28509cc
AL
1752 const U32 stateidx = TRIE_NODEIDX( state );
1753 const U32 o_used = trie->trans[ stateidx ].check;
1754 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1755 trie->trans[ stateidx ].check = 0;
1756
1757 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1758 if ( flag || trie->trans[ stateidx + charid ].next ) {
1759 if ( trie->trans[ stateidx + charid ].next ) {
1760 if (o_used == 1) {
1761 for ( ; zp < pos ; zp++ ) {
1762 if ( ! trie->trans[ zp ].next ) {
1763 break;
1764 }
1765 }
1766 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1767 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1768 trie->trans[ zp ].check = state;
1769 if ( ++zp > pos ) pos = zp;
1770 break;
1771 }
1772 used--;
1773 }
1774 if ( !flag ) {
1775 flag = 1;
1776 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1777 }
1778 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1779 trie->trans[ pos ].check = state;
1780 pos++;
1781 }
1782 }
1783 }
cc601c31 1784 trie->lasttrans = pos + 1;
1e2e3d02 1785 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1786 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1787 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1788 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1789 (int)depth * 2 + 2,"",
1790 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1791 (IV)next_alloc,
1792 (IV)pos,
a3621e74
YO
1793 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1794 );
1795
1796 } /* end table compress */
1797 }
1e2e3d02
YO
1798 DEBUG_TRIE_COMPILE_MORE_r(
1799 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1800 (int)depth * 2 + 2, "",
1801 (UV)trie->statecount,
1802 (UV)trie->lasttrans)
1803 );
cc601c31
YO
1804 /* resize the trans array to remove unused space */
1805 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1806
3dab1dad
YO
1807 /* and now dump out the compressed format */
1808 DEBUG_TRIE_COMPILE_r(
1809 dump_trie(trie,depth+1)
1810 );
07be1b83 1811
3dab1dad 1812 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1813 U8 nodetype =(U8)(flags & 0xFF);
1814 char *str=NULL;
786e8c11 1815
07be1b83 1816#ifdef DEBUGGING
e62cc96a 1817 regnode *optimize = NULL;
b57a0404
JH
1818 U32 mjd_offset = 0;
1819 U32 mjd_nodelen = 0;
07be1b83 1820#endif
a3621e74 1821 /*
3dab1dad
YO
1822 This means we convert either the first branch or the first Exact,
1823 depending on whether the thing following (in 'last') is a branch
1824 or not and whther first is the startbranch (ie is it a sub part of
1825 the alternation or is it the whole thing.)
1826 Assuming its a sub part we conver the EXACT otherwise we convert
1827 the whole branch sequence, including the first.
a3621e74 1828 */
3dab1dad 1829 /* Find the node we are going to overwrite */
7f69552c 1830 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1831 /* branch sub-chain */
3dab1dad 1832 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1833 DEBUG_r({
1834 mjd_offset= Node_Offset((convert));
1835 mjd_nodelen= Node_Length((convert));
1836 });
7f69552c
YO
1837 /* whole branch chain */
1838 } else {
1839 DEBUG_r({
1840 const regnode *nop = NEXTOPER( convert );
1841 mjd_offset= Node_Offset((nop));
1842 mjd_nodelen= Node_Length((nop));
1843 });
07be1b83 1844 }
7f69552c 1845
07be1b83
YO
1846 DEBUG_OPTIMISE_r(
1847 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1848 (int)depth * 2 + 2, "",
786e8c11 1849 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1850 );
a3621e74 1851
3dab1dad
YO
1852 /* But first we check to see if there is a common prefix we can
1853 split out as an EXACT and put in front of the TRIE node. */
1854 trie->startstate= 1;
786e8c11 1855 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1856 U32 state;
1e2e3d02 1857 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1858 U32 ofs = 0;
8e11feef
RGS
1859 I32 idx = -1;
1860 U32 count = 0;
1861 const U32 base = trie->states[ state ].trans.base;
a3621e74 1862
3dab1dad 1863 if ( trie->states[state].wordnum )
8e11feef 1864 count = 1;
a3621e74 1865
8e11feef 1866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1867 if ( ( base + ofs >= trie->uniquecharcount ) &&
1868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1870 {
3dab1dad 1871 if ( ++count > 1 ) {
8e11feef 1872 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1873 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1874 if ( state == 1 ) break;
3dab1dad
YO
1875 if ( count == 2 ) {
1876 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1877 DEBUG_OPTIMISE_r(
8e11feef
RGS
1878 PerlIO_printf(Perl_debug_log,
1879 "%*sNew Start State=%"UVuf" Class: [",
1880 (int)depth * 2 + 2, "",
786e8c11 1881 (UV)state));
be8e71aa
YO
1882 if (idx >= 0) {
1883 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1884 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1885
3dab1dad 1886 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1887 if ( folder )
1888 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1889 DEBUG_OPTIMISE_r(
07be1b83 1890 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1891 );
8e11feef
RGS
1892 }
1893 }
1894 TRIE_BITMAP_SET(trie,*ch);
1895 if ( folder )
1896 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1897 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1898 }
1899 idx = ofs;
1900 }
3dab1dad
YO
1901 }
1902 if ( count == 1 ) {
1903 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
de734bd5
A
1904 char *ch = SvPV_nolen( *tmp );
1905 DEBUG_OPTIMISE_r({
1906 SV *sv=sv_newmortal();
8e11feef
RGS
1907 PerlIO_printf( Perl_debug_log,
1908 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1909 (int)depth * 2 + 2, "",
de734bd5
A
1910 (UV)state, (UV)idx,
1911 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1912 PL_colors[0], PL_colors[1],
1913 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1914 PERL_PV_ESCAPE_FIRSTCHAR
1915 )
1916 );
1917 });
3dab1dad
YO
1918 if ( state==1 ) {
1919 OP( convert ) = nodetype;
1920 str=STRING(convert);
1921 STR_LEN(convert)=0;
1922 }
de734bd5
A
1923 while (*ch) {
1924 *str++ = *ch++;
1925 STR_LEN(convert)++;
1926 }
1927
8e11feef 1928 } else {
f9049ba1 1929#ifdef DEBUGGING
8e11feef
RGS
1930 if (state>1)
1931 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1932#endif
8e11feef
RGS
1933 break;
1934 }
1935 }
3dab1dad 1936 if (str) {
8e11feef 1937 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1938 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1939 trie->startstate = state;
07be1b83
YO
1940 trie->minlen -= (state - 1);
1941 trie->maxlen -= (state - 1);
1942 DEBUG_r({
1943 regnode *fix = convert;
de734bd5 1944 U32 word = trie->wordcount;
07be1b83
YO
1945 mjd_nodelen++;
1946 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1947 while( ++fix < n ) {
1948 Set_Node_Offset_Length(fix, 0, 0);
1949 }
de734bd5
A
1950 while (word--) {
1951 SV ** const tmp = av_fetch( trie->words, word, 0 );
1952 if (tmp) {
1953 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1954 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1955 else
1956 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1957 }
1958 }
07be1b83 1959 });
8e11feef
RGS
1960 if (trie->maxlen) {
1961 convert = n;
1962 } else {
3dab1dad 1963 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1964 DEBUG_r(optimize= n);
3dab1dad
YO
1965 }
1966 }
1967 }
a5ca303d
YO
1968 if (!jumper)
1969 jumper = last;
3dab1dad 1970 if ( trie->maxlen ) {
8e11feef
RGS
1971 NEXT_OFF( convert ) = (U16)(tail - convert);
1972 ARG_SET( convert, data_slot );
786e8c11
YO
1973 /* Store the offset to the first unabsorbed branch in
1974 jump[0], which is otherwise unused by the jump logic.
1975 We use this when dumping a trie and during optimisation. */
1976 if (trie->jump)
7f69552c 1977 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1978
786e8c11
YO
1979 /* XXXX */
1980 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1981 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1982 {
1983 OP( convert ) = TRIEC;
1984 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1985 Safefree(trie->bitmap);
1986 trie->bitmap= NULL;
1987 } else
1988 OP( convert ) = TRIE;
a3621e74 1989
3dab1dad
YO
1990 /* store the type in the flags */
1991 convert->flags = nodetype;
a5ca303d
YO
1992 DEBUG_r({
1993 optimize = convert
1994 + NODE_STEP_REGNODE
1995 + regarglen[ OP( convert ) ];
1996 });
1997 /* XXX We really should free up the resource in trie now,
1998 as we won't use them - (which resources?) dmq */
3dab1dad 1999 }
a3621e74 2000 /* needed for dumping*/
e62cc96a 2001 DEBUG_r(if (optimize) {
07be1b83 2002 regnode *opt = convert;
e62cc96a 2003 while ( ++opt < optimize) {
07be1b83
YO
2004 Set_Node_Offset_Length(opt,0,0);
2005 }
786e8c11
YO
2006 /*
2007 Try to clean up some of the debris left after the
2008 optimisation.
a3621e74 2009 */
786e8c11 2010 while( optimize < jumper ) {
07be1b83 2011 mjd_nodelen += Node_Length((optimize));
a3621e74 2012 OP( optimize ) = OPTIMIZED;
07be1b83 2013 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2014 optimize++;
2015 }
07be1b83 2016 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2017 });
2018 } /* end node insert */
07be1b83 2019#ifndef DEBUGGING
6e8b4190 2020 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 2021#endif
786e8c11
YO
2022 return trie->jump
2023 ? MADE_JUMP_TRIE
2024 : trie->startstate>1
2025 ? MADE_EXACT_TRIE
2026 : MADE_TRIE;
2027}
2028
2029STATIC void
2030S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2031{
2032/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2033
2034 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2035 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2036 ISBN 0-201-10088-6
2037
2038 We find the fail state for each state in the trie, this state is the longest proper
2039 suffix of the current states 'word' that is also a proper prefix of another word in our
2040 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2041 the DFA not to have to restart after its tried and failed a word at a given point, it
2042 simply continues as though it had been matching the other word in the first place.
2043 Consider
2044 'abcdgu'=~/abcdefg|cdgu/
2045 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2046 fail, which would bring use to the state representing 'd' in the second word where we would
2047 try 'g' and succeed, prodceding to match 'cdgu'.
2048 */
2049 /* add a fail transition */
2050 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2051 U32 *q;
2052 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2053 const U32 numstates = trie->statecount;
786e8c11
YO
2054 const U32 ubound = trie->lasttrans + ucharcount;
2055 U32 q_read = 0;
2056 U32 q_write = 0;
2057 U32 charid;
2058 U32 base = trie->states[ 1 ].trans.base;
2059 U32 *fail;
2060 reg_ac_data *aho;
2061 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2062 GET_RE_DEBUG_FLAGS_DECL;
2063#ifndef DEBUGGING
2064 PERL_UNUSED_ARG(depth);
2065#endif
2066
2067
2068 ARG_SET( stclass, data_slot );
2069 Newxz( aho, 1, reg_ac_data );
2070 RExC_rx->data->data[ data_slot ] = (void*)aho;
2071 aho->trie=trie;
2072 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2073 numstates * sizeof(reg_trie_state));
786e8c11
YO
2074 Newxz( q, numstates, U32);
2075 Newxz( aho->fail, numstates, U32 );
2076 aho->refcount = 1;
2077 fail = aho->fail;
2078 /* initialize fail[0..1] to be 1 so that we always have
2079 a valid final fail state */
2080 fail[ 0 ] = fail[ 1 ] = 1;
2081
2082 for ( charid = 0; charid < ucharcount ; charid++ ) {
2083 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2084 if ( newstate ) {
2085 q[ q_write ] = newstate;
2086 /* set to point at the root */
2087 fail[ q[ q_write++ ] ]=1;
2088 }
2089 }
2090 while ( q_read < q_write) {
2091 const U32 cur = q[ q_read++ % numstates ];
2092 base = trie->states[ cur ].trans.base;
2093
2094 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2095 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2096 if (ch_state) {
2097 U32 fail_state = cur;
2098 U32 fail_base;
2099 do {
2100 fail_state = fail[ fail_state ];
2101 fail_base = aho->states[ fail_state ].trans.base;
2102 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2103
2104 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2105 fail[ ch_state ] = fail_state;
2106 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2107 {
2108 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2109 }
2110 q[ q_write++ % numstates] = ch_state;
2111 }
2112 }
2113 }
2114 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2115 when we fail in state 1, this allows us to use the
2116 charclass scan to find a valid start char. This is based on the principle
2117 that theres a good chance the string being searched contains lots of stuff
2118 that cant be a start char.
2119 */
2120 fail[ 0 ] = fail[ 1 ] = 0;
2121 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2122 PerlIO_printf(Perl_debug_log,
2123 "%*sStclass Failtable (%"UVuf" states): 0",
2124 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2125 );
786e8c11
YO
2126 for( q_read=1; q_read<numstates; q_read++ ) {
2127 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2128 }
2129 PerlIO_printf(Perl_debug_log, "\n");
2130 });
2131 Safefree(q);
2132 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2133}
2134
786e8c11 2135
a3621e74 2136/*
5d1c421c
JH
2137 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2138 * These need to be revisited when a newer toolchain becomes available.
2139 */
2140#if defined(__sparc64__) && defined(__GNUC__)
2141# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2142# undef SPARC64_GCC_WORKAROUND
2143# define SPARC64_GCC_WORKAROUND 1
2144# endif
2145#endif
2146
07be1b83 2147#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2148 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2149 SV * const mysv=sv_newmortal(); \
2150 regnode *Next = regnext(scan); \
2151 regprop(RExC_rx, mysv, scan); \
7f69552c 2152 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2153 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2154 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2155 }});
07be1b83 2156
1de06328
YO
2157
2158
2159
2160
07be1b83
YO
2161#define JOIN_EXACT(scan,min,flags) \
2162 if (PL_regkind[OP(scan)] == EXACT) \
2163 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2164
be8e71aa 2165STATIC U32
07be1b83
YO
2166S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2167 /* Merge several consecutive EXACTish nodes into one. */
2168 regnode *n = regnext(scan);
2169 U32 stringok = 1;
2170 regnode *next = scan + NODE_SZ_STR(scan);
2171 U32 merged = 0;
2172 U32 stopnow = 0;
2173#ifdef DEBUGGING
2174 regnode *stop = scan;
72f13be8 2175 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2176#else
d47053eb
RGS
2177 PERL_UNUSED_ARG(depth);
2178#endif
2179#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2180 PERL_UNUSED_ARG(flags);
2181 PERL_UNUSED_ARG(val);
07be1b83 2182#endif
07be1b83
YO
2183 DEBUG_PEEP("join",scan,depth);
2184
2185 /* Skip NOTHING, merge EXACT*. */
2186 while (n &&
2187 ( PL_regkind[OP(n)] == NOTHING ||
2188 (stringok && (OP(n) == OP(scan))))
2189 && NEXT_OFF(n)
2190 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2191
2192 if (OP(n) == TAIL || n > next)
2193 stringok = 0;
2194 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2195 DEBUG_PEEP("skip:",n,depth);
2196 NEXT_OFF(scan) += NEXT_OFF(n);
2197 next = n + NODE_STEP_REGNODE;
2198#ifdef DEBUGGING
2199 if (stringok)
2200 stop = n;
2201#endif
2202 n = regnext(n);
2203 }
2204 else if (stringok) {
786e8c11 2205 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2206 regnode * const nnext = regnext(n);
2207
2208 DEBUG_PEEP("merg",n,depth);
2209
2210 merged++;
2211 if (oldl + STR_LEN(n) > U8_MAX)
2212 break;
2213 NEXT_OFF(scan) += NEXT_OFF(n);
2214 STR_LEN(scan) += STR_LEN(n);
2215 next = n + NODE_SZ_STR(n);
2216 /* Now we can overwrite *n : */
2217 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2218#ifdef DEBUGGING
2219 stop = next - 1;
2220#endif
2221 n = nnext;
2222 if (stopnow) break;
2223 }
2224
d47053eb
RGS
2225#ifdef EXPERIMENTAL_INPLACESCAN
2226 if (flags && !NEXT_OFF(n)) {
2227 DEBUG_PEEP("atch", val, depth);
2228 if (reg_off_by_arg[OP(n)]) {
2229 ARG_SET(n, val - n);
2230 }
2231 else {
2232 NEXT_OFF(n) = val - n;
2233 }
2234 stopnow = 1;
2235 }
07be1b83
YO
2236#endif
2237 }
2238
2239 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2240 /*
2241 Two problematic code points in Unicode casefolding of EXACT nodes:
2242
2243 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2244 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2245
2246 which casefold to
2247
2248 Unicode UTF-8
2249
2250 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2251 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2252
2253 This means that in case-insensitive matching (or "loose matching",
2254 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2255 length of the above casefolded versions) can match a target string
2256 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2257 This would rather mess up the minimum length computation.
2258
2259 What we'll do is to look for the tail four bytes, and then peek
2260 at the preceding two bytes to see whether we need to decrease
2261 the minimum length by four (six minus two).
2262
2263 Thanks to the design of UTF-8, there cannot be false matches:
2264 A sequence of valid UTF-8 bytes cannot be a subsequence of
2265 another valid sequence of UTF-8 bytes.
2266
2267 */
2268 char * const s0 = STRING(scan), *s, *t;
2269 char * const s1 = s0 + STR_LEN(scan) - 1;
2270 char * const s2 = s1 - 4;
e294cc5d
JH
2271#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2272 const char t0[] = "\xaf\x49\xaf\x42";
2273#else
07be1b83 2274 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2275#endif
07be1b83
YO
2276 const char * const t1 = t0 + 3;
2277
2278 for (s = s0 + 2;
2279 s < s2 && (t = ninstr(s, s1, t0, t1));
2280 s = t + 4) {
e294cc5d
JH
2281#ifdef EBCDIC
2282 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2283 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2284#else
07be1b83
YO
2285 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2286 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2287#endif
07be1b83
YO
2288 *min -= 4;
2289 }
2290 }
2291
2292#ifdef DEBUGGING
2293 /* Allow dumping */
2294 n = scan + NODE_SZ_STR(scan);
2295 while (n <= stop) {
2296 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2297 OP(n) = OPTIMIZED;
2298 NEXT_OFF(n) = 0;
2299 }
2300 n++;
2301 }
2302#endif
2303 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2304 return stopnow;
2305}
2306
653099ff
GS
2307/* REx optimizer. Converts nodes into quickier variants "in place".
2308 Finds fixed substrings. */
2309
a0288114 2310/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2311 to the position after last scanned or to NULL. */
2312
40d049e4
YO
2313#define INIT_AND_WITHP \
2314 assert(!and_withp); \
2315 Newx(and_withp,1,struct regnode_charclass_class); \
2316 SAVEFREEPV(and_withp)
07be1b83 2317
b515a41d
YO
2318/* this is a chain of data about sub patterns we are processing that
2319 need to be handled seperately/specially in study_chunk. Its so
2320 we can simulate recursion without losing state. */
2321struct scan_frame;
2322typedef struct scan_frame {
2323 regnode *last; /* last node to process in this frame */
2324 regnode *next; /* next node to process when last is reached */
2325 struct scan_frame *prev; /*previous frame*/
2326 I32 stop; /* what stopparen do we use */
2327} scan_frame;
2328
76e3520e 2329STATIC I32
40d049e4 2330S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2331 I32 *minlenp, I32 *deltap,
40d049e4
YO
2332 regnode *last,
2333 scan_data_t *data,
2334 I32 stopparen,
2335 U8* recursed,
2336 struct regnode_charclass_class *and_withp,
2337 U32 flags, U32 depth)
c277df42
IZ
2338 /* scanp: Start here (read-write). */
2339 /* deltap: Write maxlen-minlen here. */
2340 /* last: Stop before this one. */
40d049e4
YO
2341 /* data: string data about the pattern */
2342 /* stopparen: treat close N as END */
2343 /* recursed: which subroutines have we recursed into */
2344 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2345{
97aff369 2346 dVAR;
c277df42
IZ
2347 I32 min = 0, pars = 0, code;
2348 regnode *scan = *scanp, *next;
2349 I32 delta = 0;
2350 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2351 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2352 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2353 scan_data_t data_fake;
a3621e74 2354 SV *re_trie_maxbuff = NULL;
786e8c11 2355 regnode *first_non_open = scan;
e2e6a0f1 2356 I32 stopmin = I32_MAX;
8aa23a47
YO
2357 scan_frame *frame = NULL;
2358
a3621e74 2359 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2360
13a24bad 2361#ifdef DEBUGGING
40d049e4 2362 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2363#endif
40d049e4 2364
786e8c11 2365 if ( depth == 0 ) {
40d049e4 2366 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2367 first_non_open=regnext(first_non_open);
2368 }
2369
b81d288d 2370
8aa23a47
YO
2371 fake_study_recurse:
2372 while ( scan && OP(scan) != END && scan < last ){
2373 /* Peephole optimizer: */
2374 DEBUG_STUDYDATA(data,depth);
2375 DEBUG_PEEP("Peep",scan,depth);
2376 JOIN_EXACT(scan,&min,0);
2377
2378 /* Follow the next-chain of the current node and optimize
2379 away all the NOTHINGs from it. */
2380 if (OP(scan) != CURLYX) {
2381 const int max = (reg_off_by_arg[OP(scan)]
2382 ? I32_MAX
2383 /* I32 may be smaller than U16 on CRAYs! */
2384 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2385 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2386 int noff;
2387 regnode *n = scan;
2388
2389 /* Skip NOTHING and LONGJMP. */
2390 while ((n = regnext(n))
2391 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2392 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2393 && off + noff < max)
2394 off += noff;
2395 if (reg_off_by_arg[OP(scan)])
2396 ARG(scan) = off;
2397 else
2398 NEXT_OFF(scan) = off;
2399 }
a3621e74 2400
c277df42 2401
8aa23a47
YO
2402
2403 /* The principal pseudo-switch. Cannot be a switch, since we
2404 look into several different things. */
2405 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2406 || OP(scan) == IFTHEN) {
2407 next = regnext(scan);
2408 code = OP(scan);
2409 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2410
2411 if (OP(next) == code || code == IFTHEN) {
2412 /* NOTE - There is similar code to this block below for handling
2413 TRIE nodes on a re-study. If you change stuff here check there
2414 too. */
2415 I32 max1 = 0, min1 = I32_MAX, num = 0;
2416 struct regnode_charclass_class accum;
2417 regnode * const startbranch=scan;
2418
2419 if (flags & SCF_DO_SUBSTR)
2420 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2421 if (flags & SCF_DO_STCLASS)
2422 cl_init_zero(pRExC_state, &accum);
2423
2424 while (OP(scan) == code) {
2425 I32 deltanext, minnext, f = 0, fake;
2426 struct regnode_charclass_class this_class;
2427
2428 num++;
2429 data_fake.flags = 0;
2430 if (data) {
2431 data_fake.whilem_c = data->whilem_c;
2432 data_fake.last_closep = data->last_closep;
2433 }
2434 else
2435 data_fake.last_closep = &fake;
58e23c8d
YO
2436
2437 data_fake.pos_delta = delta;
8aa23a47
YO
2438 next = regnext(scan);
2439 scan = NEXTOPER(scan);
2440 if (code != BRANCH)
c277df42 2441 scan = NEXTOPER(scan);
8aa23a47
YO
2442 if (flags & SCF_DO_STCLASS) {
2443 cl_init(pRExC_state, &this_class);
2444 data_fake.start_class = &this_class;
2445 f = SCF_DO_STCLASS_AND;
58e23c8d 2446 }
8aa23a47
YO
2447 if (flags & SCF_WHILEM_VISITED_POS)
2448 f |= SCF_WHILEM_VISITED_POS;
2449
2450 /* we suppose the run is continuous, last=next...*/
2451 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2452 next, &data_fake,
2453 stopparen, recursed, NULL, f,depth+1);
2454 if (min1 > minnext)
2455 min1 = minnext;
2456 if (max1 < minnext + deltanext)
2457 max1 = minnext + deltanext;
2458 if (deltanext == I32_MAX)
2459 is_inf = is_inf_internal = 1;
2460 scan = next;
2461 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2462 pars++;
2463 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2464 if ( stopmin > minnext)
2465 stopmin = min + min1;
2466 flags &= ~SCF_DO_SUBSTR;
2467 if (data)
2468 data->flags |= SCF_SEEN_ACCEPT;
2469 }
2470 if (data) {
2471 if (data_fake.flags & SF_HAS_EVAL)
2472 data->flags |= SF_HAS_EVAL;
2473 data->whilem_c = data_fake.whilem_c;
3dab1dad 2474 }
8aa23a47
YO
2475 if (flags & SCF_DO_STCLASS)
2476 cl_or(pRExC_state, &accum, &this_class);
2477 }
2478 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2479 min1 = 0;
2480 if (flags & SCF_DO_SUBSTR) {
2481 data->pos_min += min1;
2482 data->pos_delta += max1 - min1;
2483 if (max1 != min1 || is_inf)
2484 data->longest = &(data->longest_float);
2485 }
2486 min += min1;
2487 delta += max1 - min1;
2488 if (flags & SCF_DO_STCLASS_OR) {
2489 cl_or(pRExC_state, data->start_class, &accum);
2490 if (min1) {
2491 cl_and(data->start_class, and_withp);
2492 flags &= ~SCF_DO_STCLASS;
653099ff 2493 }
8aa23a47
YO
2494 }
2495 else if (flags & SCF_DO_STCLASS_AND) {
2496 if (min1) {
2497 cl_and(data->start_class, &accum);
2498 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2499 }
8aa23a47
YO
2500 else {
2501 /* Switch to OR mode: cache the old value of
2502 * data->start_class */
2503 INIT_AND_WITHP;
2504 StructCopy(data->start_class, and_withp,
2505 struct regnode_charclass_class);
2506 flags &= ~SCF_DO_STCLASS_AND;
2507 StructCopy(&accum, data->start_class,
2508 struct regnode_charclass_class);
2509 flags |= SCF_DO_STCLASS_OR;
2510 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2511 }
8aa23a47 2512 }
a3621e74 2513
8aa23a47
YO
2514 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2515 /* demq.
a3621e74 2516
8aa23a47
YO
2517 Assuming this was/is a branch we are dealing with: 'scan' now
2518 points at the item that follows the branch sequence, whatever
2519 it is. We now start at the beginning of the sequence and look
2520 for subsequences of
a3621e74 2521
8aa23a47
YO
2522 BRANCH->EXACT=>x1
2523 BRANCH->EXACT=>x2
2524 tail
a3621e74 2525
8aa23a47 2526 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2527
8aa23a47
YO
2528 If we can find such a subseqence we need to turn the first
2529 element into a trie and then add the subsequent branch exact
2530 strings to the trie.
a3621e74 2531
8aa23a47 2532 We have two cases
a3621e74 2533
8aa23a47 2534 1. patterns where the whole set of branch can be converted.
a3621e74 2535
8aa23a47 2536 2. patterns where only a subset can be converted.
a3621e74 2537
8aa23a47
YO
2538 In case 1 we can replace the whole set with a single regop
2539 for the trie. In case 2 we need to keep the start and end
2540 branchs so
a3621e74 2541
8aa23a47
YO
2542 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2543 becomes BRANCH TRIE; BRANCH X;
786e8c11 2544
8aa23a47
YO
2545 There is an additional case, that being where there is a
2546 common prefix, which gets split out into an EXACT like node
2547 preceding the TRIE node.
a3621e74 2548
8aa23a47
YO
2549 If x(1..n)==tail then we can do a simple trie, if not we make
2550 a "jump" trie, such that when we match the appropriate word
2551 we "jump" to the appopriate tail node. Essentailly we turn
2552 a nested if into a case structure of sorts.
b515a41d 2553
8aa23a47
YO
2554 */
2555
2556 int made=0;
2557 if (!re_trie_maxbuff) {
2558 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2559 if (!SvIOK(re_trie_maxbuff))
2560 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2561 }
2562 if ( SvIV(re_trie_maxbuff)>=0 ) {
2563 regnode *cur;
2564 regnode *first = (regnode *)NULL;
2565 regnode *last = (regnode *)NULL;
2566 regnode *tail = scan;
2567 U8 optype = 0;
2568 U32 count=0;
a3621e74
YO
2569
2570#ifdef DEBUGGING
8aa23a47 2571 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2572#endif
8aa23a47
YO
2573 /* var tail is used because there may be a TAIL
2574 regop in the way. Ie, the exacts will point to the
2575 thing following the TAIL, but the last branch will
2576 point at the TAIL. So we advance tail. If we
2577 have nested (?:) we may have to move through several
2578 tails.
2579 */
2580
2581 while ( OP( tail ) == TAIL ) {
2582 /* this is the TAIL generated by (?:) */
2583 tail = regnext( tail );
2584 }
a3621e74 2585
8aa23a47
YO
2586
2587 DEBUG_OPTIMISE_r({
2588 regprop(RExC_rx, mysv, tail );
2589 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2590 (int)depth * 2 + 2, "",
2591 "Looking for TRIE'able sequences. Tail node is: ",
2592 SvPV_nolen_const( mysv )
2593 );
2594 });
2595
2596 /*
2597
2598 step through the branches, cur represents each
2599 branch, noper is the first thing to be matched
2600 as part of that branch and noper_next is the
2601 regnext() of that node. if noper is an EXACT
2602 and noper_next is the same as scan (our current
2603 position in the regex) then the EXACT branch is
2604 a possible optimization target. Once we have
2605 two or more consequetive such branches we can
2606 create a trie of the EXACT's contents and stich
2607 it in place. If the sequence represents all of
2608 the branches we eliminate the whole thing and
2609 replace it with a single TRIE. If it is a
2610 subsequence then we need to stitch it in. This
2611 means the first branch has to remain, and needs
2612 to be repointed at the item on the branch chain
2613 following the last branch optimized. This could
2614 be either a BRANCH, in which case the
2615 subsequence is internal, or it could be the
2616 item following the branch sequence in which
2617 case the subsequence is at the end.
2618
2619 */
2620
2621 /* dont use tail as the end marker for this traverse */
2622 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2623 regnode * const noper = NEXTOPER( cur );
b515a41d 2624#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2625 regnode * const noper_next = regnext( noper );
b515a41d
YO
2626#endif
2627
8aa23a47
YO
2628 DEBUG_OPTIMISE_r({
2629 regprop(RExC_rx, mysv, cur);
2630 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2631 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2632
2633 regprop(RExC_rx, mysv, noper);
2634 PerlIO_printf( Perl_debug_log, " -> %s",
2635 SvPV_nolen_const(mysv));
2636
2637 if ( noper_next ) {
2638 regprop(RExC_rx, mysv, noper_next );
2639 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2640 SvPV_nolen_const(mysv));
2641 }
2642 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2643 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2644 });
2645 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2646 : PL_regkind[ OP( noper ) ] == EXACT )
2647 || OP(noper) == NOTHING )
786e8c11 2648#ifdef NOJUMPTRIE
8aa23a47 2649 && noper_next == tail
786e8c11 2650#endif
8aa23a47
YO
2651 && count < U16_MAX)
2652 {
2653 count++;
2654 if ( !first || optype == NOTHING ) {
2655 if (!first) first = cur;
2656 optype = OP( noper );
2657 } else {
2658 last = cur;
2659 }
2660 } else {
2661 if ( last ) {
2662 make_trie( pRExC_state,
2663 startbranch, first, cur, tail, count,
2664 optype, depth+1 );
2665 }
2666 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2667#ifdef NOJUMPTRIE
8aa23a47 2668 && noper_next == tail
786e8c11 2669#endif
8aa23a47
YO
2670 ){
2671 count = 1;
2672 first = cur;
2673 optype = OP( noper );
2674 } else {
2675 count = 0;
2676 first = NULL;
2677 optype = 0;
2678 }
2679 last = NULL;
2680 }
2681 }
2682 DEBUG_OPTIMISE_r({
2683 regprop(RExC_rx, mysv, cur);
2684 PerlIO_printf( Perl_debug_log,
2685 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2686 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2687
2688 });
2689 if ( last ) {
2690 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2691#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2692 if ( ((made == MADE_EXACT_TRIE &&
2693 startbranch == first)
2694 || ( first_non_open == first )) &&
2695 depth==0 ) {
2696 flags |= SCF_TRIE_RESTUDY;
2697 if ( startbranch == first
2698 && scan == tail )
2699 {
2700 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2701 }
2702 }
3dab1dad 2703#endif
8aa23a47
YO
2704 }
2705 }
2706
2707 } /* do trie */
2708
653099ff 2709 }
8aa23a47
YO
2710 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2711 scan = NEXTOPER(NEXTOPER(scan));
2712 } else /* single branch is optimized. */
2713 scan = NEXTOPER(scan);
2714 continue;
2715 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2716 scan_frame *newframe = NULL;
2717 I32 paren;
2718 regnode *start;
2719 regnode *end;
2720
2721 if (OP(scan) != SUSPEND) {
2722 /* set the pointer */
2723 if (OP(scan) == GOSUB) {
2724 paren = ARG(scan);
2725 RExC_recurse[ARG2L(scan)] = scan;
2726 start = RExC_open_parens[paren-1];
2727 end = RExC_close_parens[paren-1];
2728 } else {
2729 paren = 0;
2730 start = RExC_rx->program + 1;
2731 end = RExC_opend;
2732 }
2733 if (!recursed) {
2734 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2735 SAVEFREEPV(recursed);
2736 }
2737 if (!PAREN_TEST(recursed,paren+1)) {
2738 PAREN_SET(recursed,paren+1);
2739 Newx(newframe,1,scan_frame);
2740 } else {
2741 if (flags & SCF_DO_SUBSTR) {
2742 scan_commit(pRExC_state,data,minlenp);
2743 data->longest = &(data->longest_float);
2744 }
2745 is_inf = is_inf_internal = 1;
2746 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2747 cl_anything(pRExC_state, data->start_class);
2748 flags &= ~SCF_DO_STCLASS;
2749 }
2750 } else {
2751 Newx(newframe,1,scan_frame);
2752 paren = stopparen;
2753 start = scan+2;
2754 end = regnext(scan);
2755 }
2756 if (newframe) {
2757 assert(start);
2758 assert(end);
2759 SAVEFREEPV(newframe);
2760 newframe->next = regnext(scan);
2761 newframe->last = last;
2762 newframe->stop = stopparen;
2763 newframe->prev = frame;
2764
2765 frame = newframe;
2766 scan = start;
2767 stopparen = paren;
2768 last = end;
2769
2770 continue;
2771 }
2772 }
2773 else if (OP(scan) == EXACT) {
2774 I32 l = STR_LEN(scan);
2775 UV uc;
2776 if (UTF) {
2777 const U8 * const s = (U8*)STRING(scan);
2778 l = utf8_length(s, s + l);
2779 uc = utf8_to_uvchr(s, NULL);
2780 } else {
2781 uc = *((U8*)STRING(scan));
2782 }
2783 min += l;
2784 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2785 /* The code below prefers earlier match for fixed
2786 offset, later match for variable offset. */
2787 if (data->last_end == -1) { /* Update the start info. */
2788 data->last_start_min = data->pos_min;
2789 data->last_start_max = is_inf
2790 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2791 }
8aa23a47
YO
2792 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2793 if (UTF)
2794 SvUTF8_on(data->last_found);
2795 {
2796 SV * const sv = data->last_found;
2797 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2798 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2799 if (mg && mg->mg_len >= 0)
2800 mg->mg_len += utf8_length((U8*)STRING(scan),
2801 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2802 }
8aa23a47
YO
2803 data->last_end = data->pos_min + l;
2804 data->pos_min += l; /* As in the first entry. */
2805 data->flags &= ~SF_BEFORE_EOL;
2806 }
2807 if (flags & SCF_DO_STCLASS_AND) {
2808 /* Check whether it is compatible with what we know already! */
2809 int compat = 1;
2810
2811 if (uc >= 0x100 ||
2812 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2813 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2814 && (!(data->start_class->flags & ANYOF_FOLD)
2815 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2816 )
2817 compat = 0;
2818 ANYOF_CLASS_ZERO(data->start_class);
2819 ANYOF_BITMAP_ZERO(data->start_class);
2820 if (compat)
2821 ANYOF_BITMAP_SET(data->start_class, uc);
2822 data->start_class->flags &= ~ANYOF_EOS;
2823 if (uc < 0x100)
2824 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2825 }
2826 else if (flags & SCF_DO_STCLASS_OR) {
2827 /* false positive possible if the class is case-folded */
2828 if (uc < 0x100)
2829 ANYOF_BITMAP_SET(data->start_class, uc);
2830 else
2831 data->start_class->flags |= ANYOF_UNICODE_ALL;
2832 data->start_class->flags &= ~ANYOF_EOS;
2833 cl_and(data->start_class, and_withp);
2834 }
2835 flags &= ~SCF_DO_STCLASS;
2836 }
2837 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2838 I32 l = STR_LEN(scan);
2839 UV uc = *((U8*)STRING(scan));
2840
2841 /* Search for fixed substrings supports EXACT only. */
2842 if (flags & SCF_DO_SUBSTR) {
2843 assert(data);
2844 scan_commit(pRExC_state, data, minlenp);
2845 }
2846 if (UTF) {
2847 const U8 * const s = (U8 *)STRING(scan);
2848 l = utf8_length(s, s + l);
2849 uc = utf8_to_uvchr(s, NULL);
2850 }
2851 min += l;
2852 if (flags & SCF_DO_SUBSTR)
2853 data->pos_min += l;
2854 if (flags & SCF_DO_STCLASS_AND) {
2855 /* Check whether it is compatible with what we know already! */
2856 int compat = 1;
2857
2858 if (uc >= 0x100 ||
2859 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2860 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2861 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2862 compat = 0;
2863 ANYOF_CLASS_ZERO(data->start_class);
2864 ANYOF_BITMAP_ZERO(data->start_class);
2865 if (compat) {
2866 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2867 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2868 data->start_class->flags |= ANYOF_FOLD;
2869 if (OP(scan) == EXACTFL)
2870 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2871 }
8aa23a47
YO
2872 }
2873 else if (flags & SCF_DO_STCLASS_OR) {
2874 if (data->start_class->flags & ANYOF_FOLD) {
2875 /* false positive possible if the class is case-folded.
2876 Assume that the locale settings are the same... */
1aa99e6b
IH
2877 if (uc < 0x100)
2878 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2879 data->start_class->flags &= ~ANYOF_EOS;
2880 }
8aa23a47 2881 cl_and(data->start_class, and_withp);
653099ff 2882 }
8aa23a47
YO
2883 flags &= ~SCF_DO_STCLASS;
2884 }
2885 else if (strchr((const char*)PL_varies,OP(scan))) {
2886 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2887 I32 f = flags, pos_before = 0;
2888 regnode * const oscan = scan;
2889 struct regnode_charclass_class this_class;
2890 struct regnode_charclass_class *oclass = NULL;
2891 I32 next_is_eval = 0;
2892
2893 switch (PL_regkind[OP(scan)]) {
2894 case WHILEM: /* End of (?:...)* . */
2895 scan = NEXTOPER(scan);
2896 goto finish;
2897 case PLUS:
2898 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2899 next = NEXTOPER(scan);
2900 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2901 mincount = 1;
2902 maxcount = REG_INFTY;
2903 next = regnext(scan);
2904 scan = NEXTOPER(scan);
2905 goto do_curly;
2906 }
2907 }
2908 if (flags & SCF_DO_SUBSTR)
2909 data->pos_min++;
2910 min++;
2911 /* Fall through. */
2912 case STAR:
2913 if (flags & SCF_DO_STCLASS) {
2914 mincount = 0;
2915 maxcount = REG_INFTY;
2916 next = regnext(scan);
2917 scan = NEXTOPER(scan);
2918 goto do_curly;
2919 }
2920 is_inf = is_inf_internal = 1;
2921 scan = regnext(scan);
c277df42 2922 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2923 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2924 data->longest = &(data->longest_float);
c277df42 2925 }
8aa23a47
YO
2926 goto optimize_curly_tail;
2927 case CURLY:
2928 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2929 && (scan->flags == stopparen))
2930 {
2931 mincount = 1;
2932 maxcount = 1;
2933 } else {
2934 mincount = ARG1(scan);
2935 maxcount = ARG2(scan);
653099ff 2936 }
8aa23a47
YO
2937 next = regnext(scan);
2938 if (OP(scan) == CURLYX) {
2939 I32 lp = (data ? *(data->last_closep) : 0);
2940 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2941 }
8aa23a47
YO
2942 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2943 next_is_eval = (OP(scan) == EVAL);
2944 do_curly:
2945 if (flags & SCF_DO_SUBSTR) {
2946 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2947 pos_before = data->pos_min;
b45f050a 2948 }
8aa23a47
YO
2949 if (data) {
2950 fl = data->flags;
2951 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2952 if (is_inf)
2953 data->flags |= SF_IS_INF;
2954 }
2955 if (flags & SCF_DO_STCLASS) {
2956 cl_init(pRExC_state, &this_class);
2957 oclass = data->start_class;
2958 data->start_class = &this_class;
2959 f |= SCF_DO_STCLASS_AND;
2960 f &= ~SCF_DO_STCLASS_OR;
2961 }
2962 /* These are the cases when once a subexpression
2963 fails at a particular position, it cannot succeed
2964 even after backtracking at the enclosing scope.
2965
2966 XXXX what if minimal match and we are at the
2967 initial run of {n,m}? */
2968 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2969 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2970
8aa23a47
YO
2971 /* This will finish on WHILEM, setting scan, or on NULL: */
2972 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2973 last, data, stopparen, recursed, NULL,
2974 (mincount == 0
2975 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2976
8aa23a47
YO
2977 if (flags & SCF_DO_STCLASS)
2978 data->start_class = oclass;
2979 if (mincount == 0 || minnext == 0) {
2980 if (flags & SCF_DO_STCLASS_OR) {
2981 cl_or(pRExC_state, data->start_class, &this_class);
2982 }
2983 else if (flags & SCF_DO_STCLASS_AND) {
2984 /* Switch to OR mode: cache the old value of
2985 * data->start_class */
2986 INIT_AND_WITHP;
2987 StructCopy(data->start_class, and_withp,
2988 struct regnode_charclass_class);
2989 flags &= ~SCF_DO_STCLASS_AND;
2990 StructCopy(&this_class, data->start_class,
2991 struct regnode_charclass_class);
2992 flags |= SCF_DO_STCLASS_OR;
2993 data->start_class->flags |= ANYOF_EOS;
2994 }
2995 } else { /* Non-zero len */
2996 if (flags & SCF_DO_STCLASS_OR) {
2997 cl_or(pRExC_state, data->start_class, &this_class);
2998 cl_and(data->start_class, and_withp);
2999 }
3000 else if (flags & SCF_DO_STCLASS_AND)
3001 cl_and(data->start_class, &this_class);
3002 flags &= ~SCF_DO_STCLASS;
3003 }
3004 if (!scan) /* It was not CURLYX, but CURLY. */
3005 scan = next;
3006 if ( /* ? quantifier ok, except for (?{ ... }) */
3007 (next_is_eval || !(mincount == 0 && maxcount == 1))
3008 && (minnext == 0) && (deltanext == 0)
3009 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3010 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3011 && ckWARN(WARN_REGEXP))
3012 {
3013 vWARN(RExC_parse,
3014 "Quantifier unexpected on zero-length expression");
3015 }
3016
3017 min += minnext * mincount;
3018 is_inf_internal |= ((maxcount == REG_INFTY
3019 && (minnext + deltanext) > 0)
3020 || deltanext == I32_MAX);
3021 is_inf |= is_inf_internal;
3022 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3023
3024 /* Try powerful optimization CURLYX => CURLYN. */
3025 if ( OP(oscan) == CURLYX && data
3026 && data->flags & SF_IN_PAR
3027 && !(data->flags & SF_HAS_EVAL)
3028 && !deltanext && minnext == 1 ) {
3029 /* Try to optimize to CURLYN. */
3030 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3031 regnode * const nxt1 = nxt;
497b47a8 3032#ifdef DEBUGGING
8aa23a47 3033 regnode *nxt2;
497b47a8 3034#endif
c277df42 3035
8aa23a47
YO
3036 /* Skip open. */
3037 nxt = regnext(nxt);
3038 if (!strchr((const char*)PL_simple,OP(nxt))
3039 && !(PL_regkind[OP(nxt)] == EXACT
3040 && STR_LEN(nxt) == 1))
3041 goto nogo;
497b47a8 3042#ifdef DEBUGGING
8aa23a47 3043 nxt2 = nxt;
497b47a8 3044#endif
8aa23a47
YO
3045 nxt = regnext(nxt);
3046 if (OP(nxt) != CLOSE)
3047 goto nogo;
3048 if (RExC_open_parens) {
3049 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3050 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3051 }
3052 /* Now we know that nxt2 is the only contents: */
3053 oscan->flags = (U8)ARG(nxt);
3054 OP(oscan) = CURLYN;
3055 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3056
c277df42 3057#ifdef DEBUGGING
8aa23a47
YO
3058 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3059 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3060 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3061 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3062 OP(nxt + 1) = OPTIMIZED; /* was count. */
3063 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3064#endif
8aa23a47
YO
3065 }
3066 nogo:
3067
3068 /* Try optimization CURLYX => CURLYM. */
3069 if ( OP(oscan) == CURLYX && data
3070 && !(data->flags & SF_HAS_PAR)
3071 && !(data->flags & SF_HAS_EVAL)
3072 && !deltanext /* atom is fixed width */
3073 && minnext != 0 /* CURLYM can't handle zero width */
3074 ) {
3075 /* XXXX How to optimize if data == 0? */
3076 /* Optimize to a simpler form. */
3077 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3078 regnode *nxt2;
3079
3080 OP(oscan) = CURLYM;
3081 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3082 && (OP(nxt2) != WHILEM))
3083 nxt = nxt2;
3084 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3085 /* Need to optimize away parenths. */
3086 if (data->flags & SF_IN_PAR) {
3087 /* Set the parenth number. */
3088 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3089
3090 if (OP(nxt) != CLOSE)
3091 FAIL("Panic opt close");
3092 oscan->flags = (U8)ARG(nxt);
3093 if (RExC_open_parens) {
3094 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3095 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3096 }
8aa23a47
YO
3097 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3098 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3099
c277df42 3100#ifdef DEBUGGING
8aa23a47
YO
3101 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3102 OP(nxt + 1) = OPTIMIZED; /* was count. */
3103 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3104 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3105#endif
c277df42 3106#if 0
8aa23a47
YO
3107 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3108 regnode *nnxt = regnext(nxt1);
3109
3110 if (nnxt == nxt) {
3111 if (reg_off_by_arg[OP(nxt1)])
3112 ARG_SET(nxt1, nxt2 - nxt1);
3113 else if (nxt2 - nxt1 < U16_MAX)
3114 NEXT_OFF(nxt1) = nxt2 - nxt1;
3115 else
3116 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3117 }
8aa23a47 3118 nxt1 = nnxt;
c277df42 3119 }
5d1c421c 3120#endif
8aa23a47
YO
3121 /* Optimize again: */
3122 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3123 NULL, stopparen, recursed, NULL, 0,depth+1);
3124 }
3125 else
3126 oscan->flags = 0;
3127 }
3128 else if ((OP(oscan) == CURLYX)
3129 && (flags & SCF_WHILEM_VISITED_POS)
3130 /* See the comment on a similar expression above.
3131 However, this time it not a subexpression
3132 we care about, but the expression itself. */
3133 && (maxcount == REG_INFTY)
3134 && data && ++data->whilem_c < 16) {
3135 /* This stays as CURLYX, we can put the count/of pair. */
3136 /* Find WHILEM (as in regexec.c) */
3137 regnode *nxt = oscan + NEXT_OFF(oscan);
3138
3139 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3140 nxt += ARG(nxt);
3141 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3142 | (RExC_whilem_seen << 4)); /* On WHILEM */
3143 }
3144 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3145 pars++;
3146 if (flags & SCF_DO_SUBSTR) {
3147 SV *last_str = NULL;
3148 int counted = mincount != 0;
a0ed51b3 3149
8aa23a47
YO
3150 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3151#if defined(SPARC64_GCC_WORKAROUND)
3152 I32 b = 0;
3153 STRLEN l = 0;
3154 const char *s = NULL;
3155 I32 old = 0;
b515a41d 3156
8aa23a47
YO
3157 if (pos_before >= data->last_start_min)
3158 b = pos_before;
3159 else
3160 b = data->last_start_min;
b515a41d 3161
8aa23a47
YO
3162 l = 0;
3163 s = SvPV_const(data->last_found, l);
3164 old = b - data->last_start_min;
3165
3166#else
3167 I32 b = pos_before >= data->last_start_min
3168 ? pos_before : data->last_start_min;
3169 STRLEN l;
3170 const char * const s = SvPV_const(data->last_found, l);
3171 I32 old = b - data->last_start_min;
3172#endif
3173
3174 if (UTF)
3175 old = utf8_hop((U8*)s, old) - (U8*)s;
3176
3177 l -= old;
3178 /* Get the added string: */
3179 last_str = newSVpvn(s + old, l);
3180 if (UTF)
3181 SvUTF8_on(last_str);
3182 if (deltanext == 0 && pos_before == b) {
3183 /* What was added is a constant string */
3184 if (mincount > 1) {
3185 SvGROW(last_str, (mincount * l) + 1);
3186 repeatcpy(SvPVX(last_str) + l,
3187 SvPVX_const(last_str), l, mincount - 1);
3188 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3189 /* Add additional parts. */
3190 SvCUR_set(data->last_found,
3191 SvCUR(data->last_found) - l);
3192 sv_catsv(data->last_found, last_str);
3193 {
3194 SV * sv = data->last_found;
3195 MAGIC *mg =
3196 SvUTF8(sv) && SvMAGICAL(sv) ?
3197 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3198 if (mg && mg->mg_len >= 0)
3199 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3200 }
8aa23a47 3201 data->last_end += l * (mincount - 1);
b515a41d 3202 }
8aa23a47
YO
3203 } else {
3204 /* start offset must point into the last copy */
3205 data->last_start_min += minnext * (mincount - 1);
3206 data->last_start_max += is_inf ? I32_MAX
3207 : (maxcount - 1) * (minnext + data->pos_delta);
3208 }
c277df42 3209 }
8aa23a47
YO
3210 /* It is counted once already... */
3211 data->pos_min += minnext * (mincount - counted);
3212 data->pos_delta += - counted * deltanext +
3213 (minnext + deltanext) * maxcount - minnext * mincount;
3214 if (mincount != maxcount) {
3215 /* Cannot extend fixed substrings found inside
3216 the group. */
3217 scan_commit(pRExC_state,data,minlenp);
3218 if (mincount && last_str) {
3219 SV * const sv = data->last_found;
3220 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3221 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3222
3223 if (mg)
3224 mg->mg_len = -1;
3225 sv_setsv(sv, last_str);
3226 data->last_end = data->pos_min;
3227 data->last_start_min =
3228 data->pos_min - CHR_SVLEN(last_str);
3229 data->last_start_max = is_inf
3230 ? I32_MAX
3231 : data->pos_min + data->pos_delta
3232 - CHR_SVLEN(last_str);
3233 }
3234 data->longest = &(data->longest_float);
3235 }
3236 SvREFCNT_dec(last_str);
c277df42 3237 }
8aa23a47
YO
3238 if (data && (fl & SF_HAS_EVAL))
3239 data->flags |= SF_HAS_EVAL;
3240 optimize_curly_tail:
3241 if (OP(oscan) != CURLYX) {
3242 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3243 && NEXT_OFF(next))
3244 NEXT_OFF(oscan) += NEXT_OFF(next);
3245 }
3246 continue;
3247 default: /* REF and CLUMP only? */
3248 if (flags & SCF_DO_SUBSTR) {
3249 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3250 data->longest = &(data->longest_float);
3251 }
3252 is_inf = is_inf_internal = 1;
3253 if (flags & SCF_DO_STCLASS_OR)
3254 cl_anything(pRExC_state, data->start_class);
3255 flags &= ~SCF_DO_STCLASS;
3256 break;
c277df42 3257 }
8aa23a47
YO
3258 }
3259 else if (strchr((const char*)PL_simple,OP(scan))) {
3260 int value = 0;
653099ff 3261
8aa23a47
YO
3262 if (flags & SCF_DO_SUBSTR) {
3263 scan_commit(pRExC_state,data,minlenp);
3264 data->pos_min++;
3265 }
3266 min++;
3267 if (flags & SCF_DO_STCLASS) {
3268 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3269
8aa23a47
YO
3270 /* Some of the logic below assumes that switching
3271 locale on will only add false positives. */
3272 switch (PL_regkind[OP(scan)]) {
3273 case SANY:
3274 default:
3275 do_default:
3276 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3277 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3278 cl_anything(pRExC_state, data->start_class);
3279 break;
3280 case REG_ANY:
3281 if (OP(scan) == SANY)
3282 goto do_default;
3283 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3284 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3285 || (data->start_class->flags & ANYOF_CLASS));
3286 cl_anything(pRExC_state, data->start_class);
653099ff 3287 }
8aa23a47
YO
3288 if (flags & SCF_DO_STCLASS_AND || !value)
3289 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3290 break;
3291 case ANYOF:
3292 if (flags & SCF_DO_STCLASS_AND)
3293 cl_and(data->start_class,
3294 (struct regnode_charclass_class*)scan);
653099ff 3295 else
8aa23a47
YO
3296 cl_or(pRExC_state, data->start_class,
3297 (struct regnode_charclass_class*)scan);
3298 break;
3299 case ALNUM:
3300 if (flags & SCF_DO_STCLASS_AND) {
3301 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3302 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3303 for (value = 0; value < 256; value++)
3304 if (!isALNUM(value))
3305 ANYOF_BITMAP_CLEAR(data->start_class, value);
3306 }
653099ff 3307 }
8aa23a47
YO
3308 else {
3309 if (data->start_class->flags & ANYOF_LOCALE)
3310 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3311 else {
3312 for (value = 0; value < 256; value++)
3313 if (isALNUM(value))
3314 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3315 }
8aa23a47
YO
3316 }
3317 break;
3318 case ALNUML:
3319 if (flags & SCF_DO_STCLASS_AND) {
3320 if (data->start_class->flags & ANYOF_LOCALE)
3321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3322 }
3323 else {
3324 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3325 data->start_class->flags |= ANYOF_LOCALE;
3326 }
3327 break;
3328 case NALNUM:
3329 if (flags & SCF_DO_STCLASS_AND) {
3330 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3331 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3332 for (value = 0; value < 256; value++)
3333 if (isALNUM(value))
3334 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3335 }
3336 }
8aa23a47
YO
3337 else {
3338 if (data->start_class->flags & ANYOF_LOCALE)
3339 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3340 else {
3341 for (value = 0; value < 256; value++)
3342 if (!isALNUM(value))
3343 ANYOF_BITMAP_SET(data->start_class, value);
3344 }
653099ff 3345 }
8aa23a47
YO
3346 break;
3347 case NALNUML:
3348 if (flags & SCF_DO_STCLASS_AND) {
3349 if (data->start_class->flags & ANYOF_LOCALE)
3350 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3351 }
8aa23a47
YO
3352 else {
3353 data->start_class->flags |= ANYOF_LOCALE;
3354 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3355 }
3356 break;
3357 case SPACE:
3358 if (flags & SCF_DO_STCLASS_AND) {
3359 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3360 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3361 for (value = 0; value < 256; value++)
3362 if (!isSPACE(value))
3363 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3364 }
3365 }
8aa23a47
YO
3366 else {
3367 if (data->start_class->flags & ANYOF_LOCALE)
3368 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3369 else {
3370 for (value = 0; value < 256; value++)
3371 if (isSPACE(value))
3372 ANYOF_BITMAP_SET(data->start_class, value);
3373 }
653099ff 3374 }
8aa23a47
YO
3375 break;
3376 case SPACEL:
3377 if (flags & SCF_DO_STCLASS_AND) {
3378 if (data->start_class->flags & ANYOF_LOCALE)
3379 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3380 }
3381 else {
3382 data->start_class->flags |= ANYOF_LOCALE;
3383 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3384 }
3385 break;
3386 case NSPACE:
3387 if (flags & SCF_DO_STCLASS_AND) {
3388 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3389 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3390 for (value = 0; value < 256; value++)
3391 if (isSPACE(value))
3392 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3393 }
8aa23a47
YO
3394 }
3395 else {
3396 if (data->start_class->flags & ANYOF_LOCALE)
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3398 else {
3399 for (value = 0; value < 256; value++)
3400 if (!isSPACE(value))
3401 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3402 }
3403 }
8aa23a47
YO
3404 break;
3405 case NSPACEL:
3406 if (flags & SCF_DO_STCLASS_AND) {
3407 if (data->start_class->flags & ANYOF_LOCALE) {
3408 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3409 for (value = 0; value < 256; value++)
3410 if (!isSPACE(value))
3411 ANYOF_BITMAP_CLEAR(data->start_class, value);
3412 }
653099ff 3413 }
8aa23a47
YO
3414 else {
3415 data->start_class->flags |= ANYOF_LOCALE;
3416 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3417 }
3418 break;
3419 case DIGIT:
3420 if (flags & SCF_DO_STCLASS_AND) {
3421 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3422 for (value = 0; value < 256; value++)
3423 if (!isDIGIT(value))
3424 ANYOF_BITMAP_CLEAR(data->start_class, value);
3425 }
3426 else {
3427 if (data->start_class->flags & ANYOF_LOCALE)
3428 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3429 else {
3430 for (value = 0; value < 256; value++)
3431 if (isDIGIT(value))
3432 ANYOF_BITMAP_SET(data->start_class, value);
3433 }
3434 }
3435 break;
3436 case NDIGIT:
3437 if (flags & SCF_DO_STCLASS_AND) {
3438 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3439 for (value = 0; value < 256; value++)
3440 if (isDIGIT(value))
3441 ANYOF_BITMAP_CLEAR(data->start_class, value);
3442 }
3443 else {
3444 if (data->start_class->flags & ANYOF_LOCALE)
3445 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3446 else {
3447 for (value = 0; value < 256; value++)
3448 if (!isDIGIT(value))
3449 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3450 }
3451 }
8aa23a47
YO
3452 break;
3453 }
3454 if (flags & SCF_DO_STCLASS_OR)
3455 cl_and(data->start_class, and_withp);
3456 flags &= ~SCF_DO_STCLASS;
3457 }
3458 }
3459 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3460 data->flags |= (OP(scan) == MEOL
3461 ? SF_BEFORE_MEOL
3462 : SF_BEFORE_SEOL);
3463 }
3464 else if ( PL_regkind[OP(scan)] == BRANCHJ
3465 /* Lookbehind, or need to calculate parens/evals/stclass: */
3466 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3467 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3468 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3469 || OP(scan) == UNLESSM )
3470 {
3471 /* Negative Lookahead/lookbehind
3472 In this case we can't do fixed string optimisation.
3473 */
1de06328 3474
8aa23a47
YO
3475 I32 deltanext, minnext, fake = 0;
3476 regnode *nscan;
3477 struct regnode_charclass_class intrnl;
3478 int f = 0;
1de06328 3479
8aa23a47
YO
3480 data_fake.flags = 0;
3481 if (data) {
3482 data_fake.whilem_c = data->whilem_c;
3483 data_fake.last_closep = data->last_closep;
c277df42 3484 }
8aa23a47
YO
3485 else
3486 data_fake.last_closep = &fake;
58e23c8d 3487 data_fake.pos_delta = delta;
8aa23a47
YO
3488 if ( flags & SCF_DO_STCLASS && !scan->flags
3489 && OP(scan) == IFMATCH ) { /* Lookahead */
3490 cl_init(pRExC_state, &intrnl);
3491 data_fake.start_class = &intrnl;
3492 f |= SCF_DO_STCLASS_AND;
3493 }
3494 if (flags & SCF_WHILEM_VISITED_POS)
3495 f |= SCF_WHILEM_VISITED_POS;
3496 next = regnext(scan);
3497 nscan = NEXTOPER(NEXTOPER(scan));
3498 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3499 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3500 if (scan->flags) {
3501 if (deltanext) {
58e23c8d 3502 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3503 }
3504 else if (minnext > (I32)U8_MAX) {
58e23c8d 3505 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3506 }
3507 scan->flags = (U8)minnext;
3508 }
3509 if (data) {
3510 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3511 pars++;
3512 if (data_fake.flags & SF_HAS_EVAL)
3513 data->flags |= SF_HAS_EVAL;
3514 data->whilem_c = data_fake.whilem_c;
3515 }
3516 if (f & SCF_DO_STCLASS_AND) {
3517 const int was = (data->start_class->flags & ANYOF_EOS);
3518
3519 cl_and(data->start_class, &intrnl);
3520 if (was)
3521 data->start_class->flags |= ANYOF_EOS;
3522 }
cb434fcc 3523 }
8aa23a47
YO
3524#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3525 else {
3526 /* Positive Lookahead/lookbehind
3527 In this case we can do fixed string optimisation,
3528 but we must be careful about it. Note in the case of
3529 lookbehind the positions will be offset by the minimum
3530 length of the pattern, something we won't know about
3531 until after the recurse.
3532 */
3533 I32 deltanext, fake = 0;
3534 regnode *nscan;
3535 struct regnode_charclass_class intrnl;
3536 int f = 0;
3537 /* We use SAVEFREEPV so that when the full compile
3538 is finished perl will clean up the allocated
3539 minlens when its all done. This was we don't
3540 have to worry about freeing them when we know
3541 they wont be used, which would be a pain.
3542 */
3543 I32 *minnextp;
3544 Newx( minnextp, 1, I32 );
3545 SAVEFREEPV(minnextp);
3546
3547 if (data) {
3548 StructCopy(data, &data_fake, scan_data_t);
3549 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3550 f |= SCF_DO_SUBSTR;
3551 if (scan->flags)
3552 scan_commit(pRExC_state, &data_fake,minlenp);
3553 data_fake.last_found=newSVsv(data->last_found);
3554 }
3555 }
3556 else
3557 data_fake.last_closep = &fake;
3558 data_fake.flags = 0;
58e23c8d 3559 data_fake.pos_delta = delta;
8aa23a47
YO
3560 if (is_inf)
3561 data_fake.flags |= SF_IS_INF;
3562 if ( flags & SCF_DO_STCLASS && !scan->flags
3563 && OP(scan) == IFMATCH ) { /* Lookahead */
3564 cl_init(pRExC_state, &intrnl);
3565 data_fake.start_class = &intrnl;
3566 f |= SCF_DO_STCLASS_AND;
3567 }
3568 if (flags & SCF_WHILEM_VISITED_POS)
3569 f |= SCF_WHILEM_VISITED_POS;
3570 next = regnext(scan);
3571 nscan = NEXTOPER(NEXTOPER(scan));
3572
3573 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3574 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3575 if (scan->flags) {
3576 if (deltanext) {
58e23c8d 3577 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3578 }
3579 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3580 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3581 }
3582 scan->flags = (U8)*minnextp;
3583 }
3584
3585 *minnextp += min;
3586
3587 if (f & SCF_DO_STCLASS_AND) {
3588 const int was = (data->start_class->flags & ANYOF_EOS);
3589
3590 cl_and(data->start_class, &intrnl);
3591 if (was)
3592 data->start_class->flags |= ANYOF_EOS;
3593 }
3594 if (data) {
3595 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3596 pars++;
3597 if (data_fake.flags & SF_HAS_EVAL)
3598 data->flags |= SF_HAS_EVAL;
3599 data->whilem_c = data_fake.whilem_c;
3600 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3601 if (RExC_rx->minlen<*minnextp)
3602 RExC_rx->minlen=*minnextp;
3603 scan_commit(pRExC_state, &data_fake, minnextp);
3604 SvREFCNT_dec(data_fake.last_found);
3605
3606 if ( data_fake.minlen_fixed != minlenp )
3607 {
3608 data->offset_fixed= data_fake.offset_fixed;
3609 data->minlen_fixed= data_fake.minlen_fixed;
3610 data->lookbehind_fixed+= scan->flags;
3611 }
3612 if ( data_fake.minlen_float != minlenp )
3613 {
3614 data->minlen_float= data_fake.minlen_float;
3615 data->offset_float_min=data_fake.offset_float_min;
3616 data->offset_float_max=data_fake.offset_float_max;
3617 data->lookbehind_float+= scan->flags;
3618 }
3619 }
3620 }
3621
3622
40d049e4 3623 }
8aa23a47
YO
3624#endif
3625 }
3626 else if (OP(scan) == OPEN) {
3627 if (stopparen != (I32)ARG(scan))
3628 pars++;
3629 }
3630 else if (OP(scan) == CLOSE) {
3631 if (stopparen == (I32)ARG(scan)) {
3632 break;
3633 }
3634 if ((I32)ARG(scan) == is_par) {
3635 next = regnext(scan);
b515a41d 3636
8aa23a47
YO
3637 if ( next && (OP(next) != WHILEM) && next < last)
3638 is_par = 0; /* Disable optimization */
40d049e4 3639 }
8aa23a47
YO
3640 if (data)
3641 *(data->last_closep) = ARG(scan);
3642 }
3643 else if (OP(scan) == EVAL) {
c277df42
IZ
3644 if (data)
3645 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3646 }
3647 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3648 if (flags & SCF_DO_SUBSTR) {
3649 scan_commit(pRExC_state,data,minlenp);
3650 flags &= ~SCF_DO_SUBSTR;
40d049e4 3651 }
8aa23a47
YO
3652 if (data && OP(scan)==ACCEPT) {
3653 data->flags |= SCF_SEEN_ACCEPT;
3654 if (stopmin > min)
3655 stopmin = min;
e2e6a0f1 3656 }
8aa23a47
YO
3657 }
3658 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3659 {
0f5d15d6 3660 if (flags & SCF_DO_SUBSTR) {
1de06328 3661 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3662 data->longest = &(data->longest_float);
3663 }
3664 is_inf = is_inf_internal = 1;
653099ff 3665 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3666 cl_anything(pRExC_state, data->start_class);
96776eda 3667 flags &= ~SCF_DO_STCLASS;
8aa23a47 3668 }
58e23c8d 3669 else if (OP(scan) == GPOS) {
bbe252da 3670 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3671 !(delta || is_inf || (data && data->pos_delta)))
3672 {
bbe252da
YO
3673 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3674 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3675 if (RExC_rx->gofs < (U32)min)
3676 RExC_rx->gofs = min;
3677 } else {
bbe252da 3678 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3679 RExC_rx->gofs = 0;
3680 }
3681 }
786e8c11 3682#ifdef TRIE_STUDY_OPT
40d049e4 3683#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3684 else if (PL_regkind[OP(scan)] == TRIE) {
3685 /* NOTE - There is similar code to this block above for handling
3686 BRANCH nodes on the initial study. If you change stuff here
3687 check there too. */
3688 regnode *trie_node= scan;
3689 regnode *tail= regnext(scan);
3690 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3691 I32 max1 = 0, min1 = I32_MAX;
3692 struct regnode_charclass_class accum;
3693
3694 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3695 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3696 if (flags & SCF_DO_STCLASS)
3697 cl_init_zero(pRExC_state, &accum);
3698
3699 if (!trie->jump) {
3700 min1= trie->minlen;
3701 max1= trie->maxlen;
3702 } else {
3703 const regnode *nextbranch= NULL;
3704 U32 word;
3705
3706 for ( word=1 ; word <= trie->wordcount ; word++)
3707 {
3708 I32 deltanext=0, minnext=0, f = 0, fake;
3709 struct regnode_charclass_class this_class;
3710
3711 data_fake.flags = 0;
3712 if (data) {
3713 data_fake.whilem_c = data->whilem_c;
3714 data_fake.last_closep = data->last_closep;
3715 }
3716 else
3717 data_fake.last_closep = &fake;
58e23c8d 3718 data_fake.pos_delta = delta;
8aa23a47
YO
3719 if (flags & SCF_DO_STCLASS) {
3720 cl_init(pRExC_state, &this_class);
3721 data_fake.start_class = &this_class;
3722 f = SCF_DO_STCLASS_AND;
3723 }
3724 if (flags & SCF_WHILEM_VISITED_POS)
3725 f |= SCF_WHILEM_VISITED_POS;
3726
3727 if (trie->jump[word]) {
3728 if (!nextbranch)
3729 nextbranch = trie_node + trie->jump[0];
3730 scan= trie_node + trie->jump[word];
3731 /* We go from the jump point to the branch that follows
3732 it. Note this means we need the vestigal unused branches
3733 even though they arent otherwise used.
3734 */
3735 minnext = study_chunk(pRExC_state, &scan, minlenp,
3736 &deltanext, (regnode *)nextbranch, &data_fake,
3737 stopparen, recursed, NULL, f,depth+1);
3738 }
3739 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3740 nextbranch= regnext((regnode*)nextbranch);
3741
3742 if (min1 > (I32)(minnext + trie->minlen))
3743 min1 = minnext + trie->minlen;
3744 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3745 max1 = minnext + deltanext + trie->maxlen;
3746 if (deltanext == I32_MAX)
3747 is_inf = is_inf_internal = 1;
3748
3749 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3750 pars++;
3751 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3752 if ( stopmin > min + min1)
3753 stopmin = min + min1;
3754 flags &= ~SCF_DO_SUBSTR;
3755 if (data)
3756 data->flags |= SCF_SEEN_ACCEPT;
3757 }
3758 if (data) {
3759 if (data_fake.flags & SF_HAS_EVAL)
3760 data->flags |= SF_HAS_EVAL;
3761 data->whilem_c = data_fake.whilem_c;
3762 }
3763 if (flags & SCF_DO_STCLASS)
3764 cl_or(pRExC_state, &accum, &this_class);
3765 }
3766 }
3767 if (flags & SCF_DO_SUBSTR) {
3768 data->pos_min += min1;
3769 data->pos_delta += max1 - min1;
3770 if (max1 != min1 || is_inf)
3771 data->longest = &(data->longest_float);
3772 }
3773 min += min1;
3774 delta += max1 - min1;
3775 if (flags & SCF_DO_STCLASS_OR) {
3776 cl_or(pRExC_state, data->start_class, &accum);
3777 if (min1) {
3778 cl_and(data->start_class, and_withp);
3779 flags &= ~SCF_DO_STCLASS;
3780 }
3781 }
3782 else if (flags & SCF_DO_STCLASS_AND) {
3783 if (min1) {
3784 cl_and(data->start_class, &accum);
3785 flags &= ~SCF_DO_STCLASS;
3786 }
3787 else {
3788 /* Switch to OR mode: cache the old value of
3789 * data->start_class */
3790 INIT_AND_WITHP;
3791 StructCopy(data->start_class, and_withp,
3792 struct regnode_charclass_class);
3793 flags &= ~SCF_DO_STCLASS_AND;
3794 StructCopy(&accum, data->start_class,
3795 struct regnode_charclass_class);
3796 flags |= SCF_DO_STCLASS_OR;
3797 data->start_class->flags |= ANYOF_EOS;
3798 }
3799 }
3800 scan= tail;
3801 continue;
3802 }
786e8c11 3803#else
8aa23a47
YO
3804 else if (PL_regkind[OP(scan)] == TRIE) {
3805 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3806 U8*bang=NULL;
3807
3808 min += trie->minlen;
3809 delta += (trie->maxlen - trie->minlen);
3810 flags &= ~SCF_DO_STCLASS; /* xxx */
3811 if (flags & SCF_DO_SUBSTR) {
3812 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3813 data->pos_min += trie->minlen;
3814 data->pos_delta += (trie->maxlen - trie->minlen);
3815 if (trie->maxlen != trie->minlen)
3816 data->longest = &(data->longest_float);
3817 }
3818 if (trie->jump) /* no more substrings -- for now /grr*/
3819 flags &= ~SCF_DO_SUBSTR;
b515a41d 3820 }
8aa23a47
YO
3821#endif /* old or new */
3822#endif /* TRIE_STUDY_OPT */
3823 /* Else: zero-length, ignore. */
3824 scan = regnext(scan);
3825 }
3826 if (frame) {
3827 last = frame->last;
3828 scan = frame->next;
3829 stopparen = frame->stop;
3830 frame = frame->prev;
3831 goto fake_study_recurse;
c277df42
IZ
3832 }
3833
3834 finish:
8aa23a47
YO
3835 assert(!frame);
3836
c277df42 3837 *scanp = scan;
aca2d497 3838 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3839 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3840 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3841 if (is_par > (I32)U8_MAX)
c277df42
IZ
3842 is_par = 0;
3843 if (is_par && pars==1 && data) {
3844 data->flags |= SF_IN_PAR;
3845 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3846 }
3847 else if (pars && data) {
c277df42
IZ
3848 data->flags |= SF_HAS_PAR;
3849 data->flags &= ~SF_IN_PAR;
3850 }
653099ff 3851 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3852 cl_and(data->start_class, and_withp);
786e8c11
YO
3853 if (flags & SCF_TRIE_RESTUDY)
3854 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3855
3856 DEBUG_STUDYDATA(data,depth);
3857
e2e6a0f1 3858 return min < stopmin ? min : stopmin;
c277df42
IZ
3859}
3860
76e3520e 3861STATIC I32
5f66b61c 3862S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3863{
830247a4 3864 if (RExC_rx->data) {
2eb97020 3865 const U32 count = RExC_rx->data->count;
b81d288d 3866 Renewc(RExC_rx->data,
2eb97020 3867 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3868 char, struct reg_data);
2eb97020 3869 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3870 RExC_rx->data->count += n;
a0ed51b3
LW