This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Actually, only changes to regexec.c from #30081 needed to be reverted.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
1f1031fe 129
40d049e4
YO
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
3dab1dad 136#ifdef DEBUGGING
be8e71aa 137 const char *lastparse;
3dab1dad 138 I32 lastnum;
1f1031fe 139 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 142#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 143#endif
830247a4
IZ
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
f8fc2ecf 149#define RExC_rxi (pRExC_state->rxi)
fac92740 150#define RExC_start (pRExC_state->start)
830247a4
IZ
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
154#ifdef RE_TRACK_PATTERN_OFFSETS
155#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
156#endif
830247a4 157#define RExC_emit (pRExC_state->emit)
fac92740 158#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
159#define RExC_naughty (pRExC_state->naughty)
160#define RExC_sawback (pRExC_state->sawback)
161#define RExC_seen (pRExC_state->seen)
162#define RExC_size (pRExC_state->size)
163#define RExC_npar (pRExC_state->npar)
e2e6a0f1 164#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
165#define RExC_extralen (pRExC_state->extralen)
166#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
167#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 168#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 169#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
170#define RExC_open_parens (pRExC_state->open_parens)
171#define RExC_close_parens (pRExC_state->close_parens)
172#define RExC_opend (pRExC_state->opend)
81714fb9 173#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
174#define RExC_recurse (pRExC_state->recurse)
175#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 176
cde0cee5 177
a687059c
LW
178#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
179#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
180 ((*s) == '{' && regcurly(s)))
a687059c 181
35c8bce7
LW
182#ifdef SPSTART
183#undef SPSTART /* dratted cpp namespace... */
184#endif
a687059c
LW
185/*
186 * Flags to be passed up and down.
187 */
a687059c 188#define WORST 0 /* Worst case. */
a3b492c3
YO
189#define HASWIDTH 0x01 /* Known to match non-null strings. */
190#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
191#define SPSTART 0x04 /* Starts with * or +. */
192#define TRYAGAIN 0x08 /* Weeded out a declaration. */
193#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 194
3dab1dad
YO
195#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
196
07be1b83
YO
197/* whether trie related optimizations are enabled */
198#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
199#define TRIE_STUDY_OPT
786e8c11 200#define FULL_TRIE_STUDY
07be1b83
YO
201#define TRIE_STCLASS
202#endif
1de06328
YO
203
204
40d049e4
YO
205
206#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
207#define PBITVAL(paren) (1 << ((paren) & 7))
208#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
209#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
210#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
211
212
1de06328
YO
213/* About scan_data_t.
214
215 During optimisation we recurse through the regexp program performing
216 various inplace (keyhole style) optimisations. In addition study_chunk
217 and scan_commit populate this data structure with information about
218 what strings MUST appear in the pattern. We look for the longest
219 string that must appear for at a fixed location, and we look for the
220 longest string that may appear at a floating location. So for instance
221 in the pattern:
222
223 /FOO[xX]A.*B[xX]BAR/
224
225 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
226 strings (because they follow a .* construct). study_chunk will identify
227 both FOO and BAR as being the longest fixed and floating strings respectively.
228
229 The strings can be composites, for instance
230
231 /(f)(o)(o)/
232
233 will result in a composite fixed substring 'foo'.
234
235 For each string some basic information is maintained:
236
237 - offset or min_offset
238 This is the position the string must appear at, or not before.
239 It also implicitly (when combined with minlenp) tells us how many
240 character must match before the string we are searching.
241 Likewise when combined with minlenp and the length of the string
242 tells us how many characters must appear after the string we have
243 found.
244
245 - max_offset
246 Only used for floating strings. This is the rightmost point that
247 the string can appear at. Ifset to I32 max it indicates that the
248 string can occur infinitely far to the right.
249
250 - minlenp
251 A pointer to the minimum length of the pattern that the string
252 was found inside. This is important as in the case of positive
253 lookahead or positive lookbehind we can have multiple patterns
254 involved. Consider
255
256 /(?=FOO).*F/
257
258 The minimum length of the pattern overall is 3, the minimum length
259 of the lookahead part is 3, but the minimum length of the part that
260 will actually match is 1. So 'FOO's minimum length is 3, but the
261 minimum length for the F is 1. This is important as the minimum length
262 is used to determine offsets in front of and behind the string being
263 looked for. Since strings can be composites this is the length of the
264 pattern at the time it was commited with a scan_commit. Note that
265 the length is calculated by study_chunk, so that the minimum lengths
266 are not known until the full pattern has been compiled, thus the
267 pointer to the value.
268
269 - lookbehind
270
271 In the case of lookbehind the string being searched for can be
272 offset past the start point of the final matching string.
273 If this value was just blithely removed from the min_offset it would
274 invalidate some of the calculations for how many chars must match
275 before or after (as they are derived from min_offset and minlen and
276 the length of the string being searched for).
277 When the final pattern is compiled and the data is moved from the
278 scan_data_t structure into the regexp structure the information
279 about lookbehind is factored in, with the information that would
280 have been lost precalculated in the end_shift field for the
281 associated string.
282
283 The fields pos_min and pos_delta are used to store the minimum offset
284 and the delta to the maximum offset at the current point in the pattern.
285
286*/
2c2d71f5
JH
287
288typedef struct scan_data_t {
1de06328
YO
289 /*I32 len_min; unused */
290 /*I32 len_delta; unused */
2c2d71f5
JH
291 I32 pos_min;
292 I32 pos_delta;
293 SV *last_found;
1de06328 294 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
295 I32 last_start_min;
296 I32 last_start_max;
1de06328
YO
297 SV **longest; /* Either &l_fixed, or &l_float. */
298 SV *longest_fixed; /* longest fixed string found in pattern */
299 I32 offset_fixed; /* offset where it starts */
300 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
301 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
302 SV *longest_float; /* longest floating string found in pattern */
303 I32 offset_float_min; /* earliest point in string it can appear */
304 I32 offset_float_max; /* latest point in string it can appear */
305 I32 *minlen_float; /* pointer to the minlen relevent to the string */
306 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
307 I32 flags;
308 I32 whilem_c;
cb434fcc 309 I32 *last_closep;
653099ff 310 struct regnode_charclass_class *start_class;
2c2d71f5
JH
311} scan_data_t;
312
a687059c 313/*
e50aee73 314 * Forward declarations for pregcomp()'s friends.
a687059c 315 */
a0d0e21e 316
27da23d5 317static const scan_data_t zero_scan_data =
1de06328 318 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
319
320#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
321#define SF_BEFORE_SEOL 0x0001
322#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
323#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
324#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
325
09b7f37c
CB
326#ifdef NO_UNARY_PLUS
327# define SF_FIX_SHIFT_EOL (0+2)
328# define SF_FL_SHIFT_EOL (0+4)
329#else
330# define SF_FIX_SHIFT_EOL (+2)
331# define SF_FL_SHIFT_EOL (+4)
332#endif
c277df42
IZ
333
334#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
335#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
336
337#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
338#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
339#define SF_IS_INF 0x0040
340#define SF_HAS_PAR 0x0080
341#define SF_IN_PAR 0x0100
342#define SF_HAS_EVAL 0x0200
343#define SCF_DO_SUBSTR 0x0400
653099ff
GS
344#define SCF_DO_STCLASS_AND 0x0800
345#define SCF_DO_STCLASS_OR 0x1000
346#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 347#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 348
786e8c11 349#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 350#define SCF_SEEN_ACCEPT 0x8000
07be1b83 351
eb160463 352#define UTF (RExC_utf8 != 0)
bbe252da
YO
353#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
354#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 355
ffc61ed2 356#define OOB_UNICODE 12345678
93733859 357#define OOB_NAMEDCLASS -1
b8c5462f 358
a0ed51b3
LW
359#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
360#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
361
8615cb43 362
b45f050a
JF
363/* length of regex to show in messages that don't mark a position within */
364#define RegexLengthToShowInErrorMessages 127
365
366/*
367 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
368 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
369 * op/pragma/warn/regcomp.
370 */
7253e4e3
RK
371#define MARKER1 "<-- HERE" /* marker as it appears in the description */
372#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 373
7253e4e3 374#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
375
376/*
377 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
378 * arg. Show regex, up to a maximum length. If it's too long, chop and add
379 * "...".
380 */
58e23c8d 381#define _FAIL(code) STMT_START { \
bfed75c6 382 const char *ellipses = ""; \
ccb2c380
MP
383 IV len = RExC_end - RExC_precomp; \
384 \
385 if (!SIZE_ONLY) \
386 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
387 if (len > RegexLengthToShowInErrorMessages) { \
388 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
389 len = RegexLengthToShowInErrorMessages - 10; \
390 ellipses = "..."; \
391 } \
58e23c8d 392 code; \
ccb2c380 393} STMT_END
8615cb43 394
58e23c8d
YO
395#define FAIL(msg) _FAIL( \
396 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
397 msg, (int)len, RExC_precomp, ellipses))
398
399#define FAIL2(msg,arg) _FAIL( \
400 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
401 arg, (int)len, RExC_precomp, ellipses))
402
b45f050a 403/*
b45f050a
JF
404 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
405 */
ccb2c380 406#define Simple_vFAIL(m) STMT_START { \
a28509cc 407 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
408 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
409 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
b45f050a
JF
411
412/*
413 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
414 */
ccb2c380
MP
415#define vFAIL(m) STMT_START { \
416 if (!SIZE_ONLY) \
417 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
418 Simple_vFAIL(m); \
419} STMT_END
b45f050a
JF
420
421/*
422 * Like Simple_vFAIL(), but accepts two arguments.
423 */
ccb2c380 424#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 425 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
426 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
427 (int)offset, RExC_precomp, RExC_precomp + offset); \
428} STMT_END
b45f050a
JF
429
430/*
431 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
432 */
ccb2c380
MP
433#define vFAIL2(m,a1) STMT_START { \
434 if (!SIZE_ONLY) \
435 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
436 Simple_vFAIL2(m, a1); \
437} STMT_END
b45f050a
JF
438
439
440/*
441 * Like Simple_vFAIL(), but accepts three arguments.
442 */
ccb2c380 443#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 444 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
445 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
446 (int)offset, RExC_precomp, RExC_precomp + offset); \
447} STMT_END
b45f050a
JF
448
449/*
450 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
451 */
ccb2c380
MP
452#define vFAIL3(m,a1,a2) STMT_START { \
453 if (!SIZE_ONLY) \
454 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
455 Simple_vFAIL3(m, a1, a2); \
456} STMT_END
b45f050a
JF
457
458/*
459 * Like Simple_vFAIL(), but accepts four arguments.
460 */
ccb2c380 461#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 462 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
463 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
464 (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
b45f050a 466
ccb2c380 467#define vWARN(loc,m) STMT_START { \
a28509cc 468 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
469 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
470 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
471} STMT_END
472
473#define vWARNdep(loc,m) STMT_START { \
a28509cc 474 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
475 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
476 "%s" REPORT_LOCATION, \
477 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
479
480
481#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
486
487#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
491} STMT_END
492
493#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
498
499#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 500 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
501 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
503} STMT_END
9d1d55b5 504
8615cb43 505
cd439c50 506/* Allow for side effects in s */
ccb2c380
MP
507#define REGC(c,s) STMT_START { \
508 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
509} STMT_END
cd439c50 510
fac92740
MJD
511/* Macros for recording node offsets. 20001227 mjd@plover.com
512 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
513 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
514 * Element 0 holds the number n.
07be1b83 515 * Position is 1 indexed.
fac92740 516 */
7122b237
YO
517#ifndef RE_TRACK_PATTERN_OFFSETS
518#define Set_Node_Offset_To_R(node,byte)
519#define Set_Node_Offset(node,byte)
520#define Set_Cur_Node_Offset
521#define Set_Node_Length_To_R(node,len)
522#define Set_Node_Length(node,len)
523#define Set_Node_Cur_Length(node)
524#define Node_Offset(n)
525#define Node_Length(n)
526#define Set_Node_Offset_Length(node,offset,len)
527#define ProgLen(ri) ri->u.proglen
528#define SetProgLen(ri,x) ri->u.proglen = x
529#else
530#define ProgLen(ri) ri->u.offsets[0]
531#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
532#define Set_Node_Offset_To_R(node,byte) STMT_START { \
533 if (! SIZE_ONLY) { \
534 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 535 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 536 if((node) < 0) { \
551405c4 537 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
538 } else { \
539 RExC_offsets[2*(node)-1] = (byte); \
540 } \
541 } \
542} STMT_END
543
544#define Set_Node_Offset(node,byte) \
545 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
546#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
547
548#define Set_Node_Length_To_R(node,len) STMT_START { \
549 if (! SIZE_ONLY) { \
550 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 551 __LINE__, (int)(node), (int)(len))); \
ccb2c380 552 if((node) < 0) { \
551405c4 553 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
554 } else { \
555 RExC_offsets[2*(node)] = (len); \
556 } \
557 } \
558} STMT_END
559
560#define Set_Node_Length(node,len) \
561 Set_Node_Length_To_R((node)-RExC_emit_start, len)
562#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
563#define Set_Node_Cur_Length(node) \
564 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
565
566/* Get offsets and lengths */
567#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
568#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
569
07be1b83
YO
570#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
571 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
572 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
573} STMT_END
7122b237 574#endif
07be1b83
YO
575
576#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
577#define EXPERIMENTAL_INPLACESCAN
7122b237 578#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 579
304ee84b
YO
580#define DEBUG_STUDYDATA(str,data,depth) \
581DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 582 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
583 "%*s" str "Pos:%"IVdf"/%"IVdf \
584 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
585 (int)(depth)*2, "", \
586 (IV)((data)->pos_min), \
587 (IV)((data)->pos_delta), \
304ee84b 588 (UV)((data)->flags), \
1de06328 589 (IV)((data)->whilem_c), \
304ee84b
YO
590 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
591 is_inf ? "INF " : "" \
1de06328
YO
592 ); \
593 if ((data)->last_found) \
594 PerlIO_printf(Perl_debug_log, \
595 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
596 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
597 SvPVX_const((data)->last_found), \
598 (IV)((data)->last_end), \
599 (IV)((data)->last_start_min), \
600 (IV)((data)->last_start_max), \
601 ((data)->longest && \
602 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
603 SvPVX_const((data)->longest_fixed), \
604 (IV)((data)->offset_fixed), \
605 ((data)->longest && \
606 (data)->longest==&((data)->longest_float)) ? "*" : "", \
607 SvPVX_const((data)->longest_float), \
608 (IV)((data)->offset_float_min), \
609 (IV)((data)->offset_float_max) \
610 ); \
611 PerlIO_printf(Perl_debug_log,"\n"); \
612});
613
acfe0abc 614static void clear_re(pTHX_ void *r);
4327152a 615
653099ff 616/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 617 Update the longest found anchored substring and the longest found
653099ff
GS
618 floating substrings if needed. */
619
4327152a 620STATIC void
304ee84b 621S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 622{
e1ec3a88
AL
623 const STRLEN l = CHR_SVLEN(data->last_found);
624 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 625 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 626
c277df42 627 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 628 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
629 if (*data->longest == data->longest_fixed) {
630 data->offset_fixed = l ? data->last_start_min : data->pos_min;
631 if (data->flags & SF_BEFORE_EOL)
b81d288d 632 data->flags
c277df42
IZ
633 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
634 else
635 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
636 data->minlen_fixed=minlenp;
637 data->lookbehind_fixed=0;
a0ed51b3 638 }
304ee84b 639 else { /* *data->longest == data->longest_float */
c277df42 640 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
641 data->offset_float_max = (l
642 ? data->last_start_max
c277df42 643 : data->pos_min + data->pos_delta);
304ee84b 644 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 645 data->offset_float_max = I32_MAX;
c277df42 646 if (data->flags & SF_BEFORE_EOL)
b81d288d 647 data->flags
c277df42
IZ
648 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
649 else
650 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
651 data->minlen_float=minlenp;
652 data->lookbehind_float=0;
c277df42
IZ
653 }
654 }
655 SvCUR_set(data->last_found, 0);
0eda9292 656 {
a28509cc 657 SV * const sv = data->last_found;
097eb12c
AL
658 if (SvUTF8(sv) && SvMAGICAL(sv)) {
659 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
660 if (mg)
661 mg->mg_len = 0;
662 }
0eda9292 663 }
c277df42
IZ
664 data->last_end = -1;
665 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 666 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
667}
668
653099ff
GS
669/* Can match anything (initialization) */
670STATIC void
097eb12c 671S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 672{
653099ff 673 ANYOF_CLASS_ZERO(cl);
f8bef550 674 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 675 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
676 if (LOC)
677 cl->flags |= ANYOF_LOCALE;
678}
679
680/* Can match anything (initialization) */
681STATIC int
5f66b61c 682S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
683{
684 int value;
685
aaa51d5e 686 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
687 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
688 return 1;
1aa99e6b
IH
689 if (!(cl->flags & ANYOF_UNICODE_ALL))
690 return 0;
10edeb5d 691 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 692 return 0;
653099ff
GS
693 return 1;
694}
695
696/* Can match anything (initialization) */
697STATIC void
097eb12c 698S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 699{
8ecf7187 700 Zero(cl, 1, struct regnode_charclass_class);
653099ff 701 cl->type = ANYOF;
830247a4 702 cl_anything(pRExC_state, cl);
653099ff
GS
703}
704
705STATIC void
097eb12c 706S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 707{
8ecf7187 708 Zero(cl, 1, struct regnode_charclass_class);
653099ff 709 cl->type = ANYOF;
830247a4 710 cl_anything(pRExC_state, cl);
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
713}
714
715/* 'And' a given class with another one. Can create false positives */
716/* We assume that cl is not inverted */
717STATIC void
5f66b61c 718S_cl_and(struct regnode_charclass_class *cl,
a28509cc 719 const struct regnode_charclass_class *and_with)
653099ff 720{
40d049e4
YO
721
722 assert(and_with->type == ANYOF);
653099ff
GS
723 if (!(and_with->flags & ANYOF_CLASS)
724 && !(cl->flags & ANYOF_CLASS)
725 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
726 && !(and_with->flags & ANYOF_FOLD)
727 && !(cl->flags & ANYOF_FOLD)) {
728 int i;
729
730 if (and_with->flags & ANYOF_INVERT)
731 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
732 cl->bitmap[i] &= ~and_with->bitmap[i];
733 else
734 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
735 cl->bitmap[i] &= and_with->bitmap[i];
736 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
737 if (!(and_with->flags & ANYOF_EOS))
738 cl->flags &= ~ANYOF_EOS;
1aa99e6b 739
14ebb1a2
JH
740 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
741 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
742 cl->flags &= ~ANYOF_UNICODE_ALL;
743 cl->flags |= ANYOF_UNICODE;
744 ARG_SET(cl, ARG(and_with));
745 }
14ebb1a2
JH
746 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
747 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 748 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
749 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
750 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 751 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
752}
753
754/* 'OR' a given class with another one. Can create false positives */
755/* We assume that cl is not inverted */
756STATIC void
097eb12c 757S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 758{
653099ff
GS
759 if (or_with->flags & ANYOF_INVERT) {
760 /* We do not use
761 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
762 * <= (B1 | !B2) | (CL1 | !CL2)
763 * which is wasteful if CL2 is small, but we ignore CL2:
764 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
765 * XXXX Can we handle case-fold? Unclear:
766 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
767 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
768 */
769 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
770 && !(or_with->flags & ANYOF_FOLD)
771 && !(cl->flags & ANYOF_FOLD) ) {
772 int i;
773
774 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
775 cl->bitmap[i] |= ~or_with->bitmap[i];
776 } /* XXXX: logic is complicated otherwise */
777 else {
830247a4 778 cl_anything(pRExC_state, cl);
653099ff
GS
779 }
780 } else {
781 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
782 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 783 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
784 || (cl->flags & ANYOF_FOLD)) ) {
785 int i;
786
787 /* OR char bitmap and class bitmap separately */
788 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
789 cl->bitmap[i] |= or_with->bitmap[i];
790 if (or_with->flags & ANYOF_CLASS) {
791 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
792 cl->classflags[i] |= or_with->classflags[i];
793 cl->flags |= ANYOF_CLASS;
794 }
795 }
796 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 797 cl_anything(pRExC_state, cl);
653099ff
GS
798 }
799 }
800 if (or_with->flags & ANYOF_EOS)
801 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
802
803 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
804 ARG(cl) != ARG(or_with)) {
805 cl->flags |= ANYOF_UNICODE_ALL;
806 cl->flags &= ~ANYOF_UNICODE;
807 }
808 if (or_with->flags & ANYOF_UNICODE_ALL) {
809 cl->flags |= ANYOF_UNICODE_ALL;
810 cl->flags &= ~ANYOF_UNICODE;
811 }
653099ff
GS
812}
813
a3621e74
YO
814#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
815#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
816#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
817#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
818
3dab1dad
YO
819
820#ifdef DEBUGGING
07be1b83 821/*
2b8b4781
NC
822 dump_trie(trie,widecharmap,revcharmap)
823 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
824 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
825
826 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
827 The _interim_ variants are used for debugging the interim
828 tables that are used to generate the final compressed
829 representation which is what dump_trie expects.
830
3dab1dad
YO
831 Part of the reason for their existance is to provide a form
832 of documentation as to how the different representations function.
07be1b83
YO
833
834*/
3dab1dad
YO
835
836/*
3dab1dad
YO
837 Dumps the final compressed table form of the trie to Perl_debug_log.
838 Used for debugging make_trie().
839*/
840
841STATIC void
2b8b4781
NC
842S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
843 AV *revcharmap, U32 depth)
3dab1dad
YO
844{
845 U32 state;
ab3bbdeb 846 SV *sv=sv_newmortal();
55eed653 847 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
848 GET_RE_DEBUG_FLAGS_DECL;
849
ab3bbdeb 850
3dab1dad
YO
851 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
852 (int)depth * 2 + 2,"",
853 "Match","Base","Ofs" );
854
855 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 856 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 857 if ( tmp ) {
ab3bbdeb
YO
858 PerlIO_printf( Perl_debug_log, "%*s",
859 colwidth,
ddc5bc0f 860 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
861 PL_colors[0], PL_colors[1],
862 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
863 PERL_PV_ESCAPE_FIRSTCHAR
864 )
865 );
3dab1dad
YO
866 }
867 }
868 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
869 (int)depth * 2 + 2,"");
870
871 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 872 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
873 PerlIO_printf( Perl_debug_log, "\n");
874
1e2e3d02 875 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 876 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
877
878 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
879
880 if ( trie->states[ state ].wordnum ) {
881 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
882 } else {
883 PerlIO_printf( Perl_debug_log, "%6s", "" );
884 }
885
886 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
887
888 if ( base ) {
889 U32 ofs = 0;
890
891 while( ( base + ofs < trie->uniquecharcount ) ||
892 ( base + ofs - trie->uniquecharcount < trie->lasttrans
893 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
894 ofs++;
895
896 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
897
898 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
899 if ( ( base + ofs >= trie->uniquecharcount ) &&
900 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
901 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
902 {
ab3bbdeb
YO
903 PerlIO_printf( Perl_debug_log, "%*"UVXf,
904 colwidth,
3dab1dad
YO
905 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
906 } else {
ab3bbdeb 907 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
908 }
909 }
910
911 PerlIO_printf( Perl_debug_log, "]");
912
913 }
914 PerlIO_printf( Perl_debug_log, "\n" );
915 }
916}
917/*
3dab1dad
YO
918 Dumps a fully constructed but uncompressed trie in list form.
919 List tries normally only are used for construction when the number of
920 possible chars (trie->uniquecharcount) is very high.
921 Used for debugging make_trie().
922*/
923STATIC void
55eed653 924S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
925 HV *widecharmap, AV *revcharmap, U32 next_alloc,
926 U32 depth)
3dab1dad
YO
927{
928 U32 state;
ab3bbdeb 929 SV *sv=sv_newmortal();
55eed653 930 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
931 GET_RE_DEBUG_FLAGS_DECL;
932 /* print out the table precompression. */
ab3bbdeb
YO
933 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
934 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
935 "------:-----+-----------------\n" );
3dab1dad
YO
936
937 for( state=1 ; state < next_alloc ; state ++ ) {
938 U16 charid;
939
ab3bbdeb 940 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
941 (int)depth * 2 + 2,"", (UV)state );
942 if ( ! trie->states[ state ].wordnum ) {
943 PerlIO_printf( Perl_debug_log, "%5s| ","");
944 } else {
945 PerlIO_printf( Perl_debug_log, "W%4x| ",
946 trie->states[ state ].wordnum
947 );
948 }
949 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 950 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
951 if ( tmp ) {
952 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
953 colwidth,
ddc5bc0f 954 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
955 PL_colors[0], PL_colors[1],
956 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
957 PERL_PV_ESCAPE_FIRSTCHAR
958 ) ,
1e2e3d02
YO
959 TRIE_LIST_ITEM(state,charid).forid,
960 (UV)TRIE_LIST_ITEM(state,charid).newstate
961 );
962 if (!(charid % 10))
664e119d
RGS
963 PerlIO_printf(Perl_debug_log, "\n%*s| ",
964 (int)((depth * 2) + 14), "");
1e2e3d02 965 }
ab3bbdeb
YO
966 }
967 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
968 }
969}
970
971/*
3dab1dad
YO
972 Dumps a fully constructed but uncompressed trie in table form.
973 This is the normal DFA style state transition table, with a few
974 twists to facilitate compression later.
975 Used for debugging make_trie().
976*/
977STATIC void
55eed653 978S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
979 HV *widecharmap, AV *revcharmap, U32 next_alloc,
980 U32 depth)
3dab1dad
YO
981{
982 U32 state;
983 U16 charid;
ab3bbdeb 984 SV *sv=sv_newmortal();
55eed653 985 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
986 GET_RE_DEBUG_FLAGS_DECL;
987
988 /*
989 print out the table precompression so that we can do a visual check
990 that they are identical.
991 */
992
993 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
994
995 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 996 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 997 if ( tmp ) {
ab3bbdeb
YO
998 PerlIO_printf( Perl_debug_log, "%*s",
999 colwidth,
ddc5bc0f 1000 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1001 PL_colors[0], PL_colors[1],
1002 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1003 PERL_PV_ESCAPE_FIRSTCHAR
1004 )
1005 );
3dab1dad
YO
1006 }
1007 }
1008
1009 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1010
1011 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1012 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1013 }
1014
1015 PerlIO_printf( Perl_debug_log, "\n" );
1016
1017 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1018
1019 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1020 (int)depth * 2 + 2,"",
1021 (UV)TRIE_NODENUM( state ) );
1022
1023 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1024 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1025 if (v)
1026 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1027 else
1028 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1029 }
1030 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1031 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1032 } else {
1033 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1034 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1035 }
1036 }
07be1b83 1037}
3dab1dad
YO
1038
1039#endif
1040
786e8c11
YO
1041/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1042 startbranch: the first branch in the whole branch sequence
1043 first : start branch of sequence of branch-exact nodes.
1044 May be the same as startbranch
1045 last : Thing following the last branch.
1046 May be the same as tail.
1047 tail : item following the branch sequence
1048 count : words in the sequence
1049 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1050 depth : indent depth
3dab1dad 1051
786e8c11 1052Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1053
786e8c11
YO
1054A trie is an N'ary tree where the branches are determined by digital
1055decomposition of the key. IE, at the root node you look up the 1st character and
1056follow that branch repeat until you find the end of the branches. Nodes can be
1057marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1058
786e8c11 1059 /he|she|his|hers/
72f13be8 1060
786e8c11
YO
1061would convert into the following structure. Numbers represent states, letters
1062following numbers represent valid transitions on the letter from that state, if
1063the number is in square brackets it represents an accepting state, otherwise it
1064will be in parenthesis.
07be1b83 1065
786e8c11
YO
1066 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1067 | |
1068 | (2)
1069 | |
1070 (1) +-i->(6)-+-s->[7]
1071 |
1072 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1073
786e8c11
YO
1074 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1075
1076This shows that when matching against the string 'hers' we will begin at state 1
1077read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1078then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1079is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1080single traverse. We store a mapping from accepting to state to which word was
1081matched, and then when we have multiple possibilities we try to complete the
1082rest of the regex in the order in which they occured in the alternation.
1083
1084The only prior NFA like behaviour that would be changed by the TRIE support is
1085the silent ignoring of duplicate alternations which are of the form:
1086
1087 / (DUPE|DUPE) X? (?{ ... }) Y /x
1088
1089Thus EVAL blocks follwing a trie may be called a different number of times with
1090and without the optimisation. With the optimisations dupes will be silently
1091ignored. This inconsistant behaviour of EVAL type nodes is well established as
1092the following demonstrates:
1093
1094 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1095
1096which prints out 'word' three times, but
1097
1098 'words'=~/(word|word|word)(?{ print $1 })S/
1099
1100which doesnt print it out at all. This is due to other optimisations kicking in.
1101
1102Example of what happens on a structural level:
1103
1104The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1105
1106 1: CURLYM[1] {1,32767}(18)
1107 5: BRANCH(8)
1108 6: EXACT <ac>(16)
1109 8: BRANCH(11)
1110 9: EXACT <ad>(16)
1111 11: BRANCH(14)
1112 12: EXACT <ab>(16)
1113 16: SUCCEED(0)
1114 17: NOTHING(18)
1115 18: END(0)
1116
1117This would be optimizable with startbranch=5, first=5, last=16, tail=16
1118and should turn into:
1119
1120 1: CURLYM[1] {1,32767}(18)
1121 5: TRIE(16)
1122 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1123 <ac>
1124 <ad>
1125 <ab>
1126 16: SUCCEED(0)
1127 17: NOTHING(18)
1128 18: END(0)
1129
1130Cases where tail != last would be like /(?foo|bar)baz/:
1131
1132 1: BRANCH(4)
1133 2: EXACT <foo>(8)
1134 4: BRANCH(7)
1135 5: EXACT <bar>(8)
1136 7: TAIL(8)
1137 8: EXACT <baz>(10)
1138 10: END(0)
1139
1140which would be optimizable with startbranch=1, first=1, last=7, tail=8
1141and would end up looking like:
1142
1143 1: TRIE(8)
1144 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1145 <foo>
1146 <bar>
1147 7: TAIL(8)
1148 8: EXACT <baz>(10)
1149 10: END(0)
1150
1151 d = uvuni_to_utf8_flags(d, uv, 0);
1152
1153is the recommended Unicode-aware way of saying
1154
1155 *(d++) = uv;
1156*/
1157
1e2e3d02 1158#define TRIE_STORE_REVCHAR \
786e8c11 1159 STMT_START { \
1e2e3d02 1160 SV *tmp = newSVpvs(""); \
786e8c11 1161 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1162 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1163 av_push( revcharmap, tmp ); \
786e8c11
YO
1164 } STMT_END
1165
1166#define TRIE_READ_CHAR STMT_START { \
1167 wordlen++; \
1168 if ( UTF ) { \
1169 if ( folder ) { \
1170 if ( foldlen > 0 ) { \
1171 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1172 foldlen -= len; \
1173 scan += len; \
1174 len = 0; \
1175 } else { \
1176 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1177 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1178 foldlen -= UNISKIP( uvc ); \
1179 scan = foldbuf + UNISKIP( uvc ); \
1180 } \
1181 } else { \
1182 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1183 } \
1184 } else { \
1185 uvc = (U32)*uc; \
1186 len = 1; \
1187 } \
1188} STMT_END
1189
1190
1191
1192#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1193 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1194 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1195 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1196 } \
1197 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1198 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1199 TRIE_LIST_CUR( state )++; \
1200} STMT_END
07be1b83 1201
786e8c11
YO
1202#define TRIE_LIST_NEW(state) STMT_START { \
1203 Newxz( trie->states[ state ].trans.list, \
1204 4, reg_trie_trans_le ); \
1205 TRIE_LIST_CUR( state ) = 1; \
1206 TRIE_LIST_LEN( state ) = 4; \
1207} STMT_END
07be1b83 1208
786e8c11
YO
1209#define TRIE_HANDLE_WORD(state) STMT_START { \
1210 U16 dupe= trie->states[ state ].wordnum; \
1211 regnode * const noper_next = regnext( noper ); \
1212 \
1213 if (trie->wordlen) \
1214 trie->wordlen[ curword ] = wordlen; \
1215 DEBUG_r({ \
1216 /* store the word for dumping */ \
1217 SV* tmp; \
1218 if (OP(noper) != NOTHING) \
1219 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1220 else \
1221 tmp = newSVpvn( "", 0 ); \
1222 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1223 av_push( trie_words, tmp ); \
786e8c11
YO
1224 }); \
1225 \
1226 curword++; \
1227 \
1228 if ( noper_next < tail ) { \
1229 if (!trie->jump) \
c944940b 1230 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1231 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1232 if (!jumper) \
1233 jumper = noper_next; \
1234 if (!nextbranch) \
1235 nextbranch= regnext(cur); \
1236 } \
1237 \
1238 if ( dupe ) { \
1239 /* So it's a dupe. This means we need to maintain a */\
1240 /* linked-list from the first to the next. */\
1241 /* we only allocate the nextword buffer when there */\
1242 /* a dupe, so first time we have to do the allocation */\
1243 if (!trie->nextword) \
c944940b 1244 trie->nextword = (U16 *) \
446bd890 1245 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1246 while ( trie->nextword[dupe] ) \
1247 dupe= trie->nextword[dupe]; \
1248 trie->nextword[dupe]= curword; \
1249 } else { \
1250 /* we haven't inserted this word yet. */ \
1251 trie->states[ state ].wordnum = curword; \
1252 } \
1253} STMT_END
07be1b83 1254
3dab1dad 1255
786e8c11
YO
1256#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1257 ( ( base + charid >= ucharcount \
1258 && base + charid < ubound \
1259 && state == trie->trans[ base - ucharcount + charid ].check \
1260 && trie->trans[ base - ucharcount + charid ].next ) \
1261 ? trie->trans[ base - ucharcount + charid ].next \
1262 : ( state==1 ? special : 0 ) \
1263 )
3dab1dad 1264
786e8c11
YO
1265#define MADE_TRIE 1
1266#define MADE_JUMP_TRIE 2
1267#define MADE_EXACT_TRIE 4
3dab1dad 1268
a3621e74 1269STATIC I32
786e8c11 1270S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1271{
27da23d5 1272 dVAR;
a3621e74
YO
1273 /* first pass, loop through and scan words */
1274 reg_trie_data *trie;
55eed653 1275 HV *widecharmap = NULL;
2b8b4781 1276 AV *revcharmap = newAV();
a3621e74 1277 regnode *cur;
9f7f3913 1278 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1279 STRLEN len = 0;
1280 UV uvc = 0;
1281 U16 curword = 0;
1282 U32 next_alloc = 0;
786e8c11
YO
1283 regnode *jumper = NULL;
1284 regnode *nextbranch = NULL;
7f69552c 1285 regnode *convert = NULL;
a3621e74 1286 /* we just use folder as a flag in utf8 */
e1ec3a88 1287 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1288 ? PL_fold
1289 : ( flags == EXACTFL
1290 ? PL_fold_locale
1291 : NULL
1292 )
1293 );
1294
2b8b4781
NC
1295#ifdef DEBUGGING
1296 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1297 AV *trie_words = NULL;
1298 /* along with revcharmap, this only used during construction but both are
1299 * useful during debugging so we store them in the struct when debugging.
8e11feef 1300 */
2b8b4781
NC
1301#else
1302 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1303 STRLEN trie_charcount=0;
3dab1dad 1304#endif
2b8b4781 1305 SV *re_trie_maxbuff;
a3621e74 1306 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1307#ifndef DEBUGGING
1308 PERL_UNUSED_ARG(depth);
1309#endif
a3621e74 1310
c944940b 1311 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1312 trie->refcount = 1;
3dab1dad 1313 trie->startstate = 1;
786e8c11 1314 trie->wordcount = word_count;
f8fc2ecf 1315 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1316 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1317 if (!(UTF && folder))
c944940b 1318 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1319 DEBUG_r({
2b8b4781 1320 trie_words = newAV();
a3621e74 1321 });
a3621e74 1322
0111c4fd 1323 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1324 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1325 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1326 }
3dab1dad
YO
1327 DEBUG_OPTIMISE_r({
1328 PerlIO_printf( Perl_debug_log,
786e8c11 1329 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1330 (int)depth * 2 + 2, "",
1331 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1332 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1333 (int)depth);
3dab1dad 1334 });
7f69552c
YO
1335
1336 /* Find the node we are going to overwrite */
1337 if ( first == startbranch && OP( last ) != BRANCH ) {
1338 /* whole branch chain */
1339 convert = first;
1340 } else {
1341 /* branch sub-chain */
1342 convert = NEXTOPER( first );
1343 }
1344
a3621e74
YO
1345 /* -- First loop and Setup --
1346
1347 We first traverse the branches and scan each word to determine if it
1348 contains widechars, and how many unique chars there are, this is
1349 important as we have to build a table with at least as many columns as we
1350 have unique chars.
1351
1352 We use an array of integers to represent the character codes 0..255
1353 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1354 native representation of the character value as the key and IV's for the
1355 coded index.
1356
1357 *TODO* If we keep track of how many times each character is used we can
1358 remap the columns so that the table compression later on is more
1359 efficient in terms of memory by ensuring most common value is in the
1360 middle and the least common are on the outside. IMO this would be better
1361 than a most to least common mapping as theres a decent chance the most
1362 common letter will share a node with the least common, meaning the node
1363 will not be compressable. With a middle is most common approach the worst
1364 case is when we have the least common nodes twice.
1365
1366 */
1367
a3621e74 1368 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1369 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1370 const U8 *uc = (U8*)STRING( noper );
a28509cc 1371 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1372 STRLEN foldlen = 0;
1373 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1374 const U8 *scan = (U8*)NULL;
07be1b83 1375 U32 wordlen = 0; /* required init */
3dab1dad 1376 STRLEN chars=0;
a3621e74 1377
3dab1dad
YO
1378 if (OP(noper) == NOTHING) {
1379 trie->minlen= 0;
1380 continue;
1381 }
1382 if (trie->bitmap) {
1383 TRIE_BITMAP_SET(trie,*uc);
1384 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1385 }
a3621e74 1386 for ( ; uc < e ; uc += len ) {
3dab1dad 1387 TRIE_CHARCOUNT(trie)++;
a3621e74 1388 TRIE_READ_CHAR;
3dab1dad 1389 chars++;
a3621e74
YO
1390 if ( uvc < 256 ) {
1391 if ( !trie->charmap[ uvc ] ) {
1392 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1393 if ( folder )
1394 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1395 TRIE_STORE_REVCHAR;
a3621e74
YO
1396 }
1397 } else {
1398 SV** svpp;
55eed653
NC
1399 if ( !widecharmap )
1400 widecharmap = newHV();
a3621e74 1401
55eed653 1402 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1403
1404 if ( !svpp )
e4584336 1405 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1406
1407 if ( !SvTRUE( *svpp ) ) {
1408 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1409 TRIE_STORE_REVCHAR;
a3621e74
YO
1410 }
1411 }
1412 }
3dab1dad
YO
1413 if( cur == first ) {
1414 trie->minlen=chars;
1415 trie->maxlen=chars;
1416 } else if (chars < trie->minlen) {
1417 trie->minlen=chars;
1418 } else if (chars > trie->maxlen) {
1419 trie->maxlen=chars;
1420 }
1421
a3621e74
YO
1422 } /* end first pass */
1423 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1424 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1425 (int)depth * 2 + 2,"",
55eed653 1426 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1427 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1428 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1429 );
c944940b 1430 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1431
1432 /*
1433 We now know what we are dealing with in terms of unique chars and
1434 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1435 representation using a flat table will take. If it's over a reasonable
1436 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1437 conservative but potentially much slower representation using an array
1438 of lists.
1439
1440 At the end we convert both representations into the same compressed
1441 form that will be used in regexec.c for matching with. The latter
1442 is a form that cannot be used to construct with but has memory
1443 properties similar to the list form and access properties similar
1444 to the table form making it both suitable for fast searches and
1445 small enough that its feasable to store for the duration of a program.
1446
1447 See the comment in the code where the compressed table is produced
1448 inplace from the flat tabe representation for an explanation of how
1449 the compression works.
1450
1451 */
1452
1453
3dab1dad 1454 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1455 /*
1456 Second Pass -- Array Of Lists Representation
1457
1458 Each state will be represented by a list of charid:state records
1459 (reg_trie_trans_le) the first such element holds the CUR and LEN
1460 points of the allocated array. (See defines above).
1461
1462 We build the initial structure using the lists, and then convert
1463 it into the compressed table form which allows faster lookups
1464 (but cant be modified once converted).
a3621e74
YO
1465 */
1466
a3621e74
YO
1467 STRLEN transcount = 1;
1468
1e2e3d02
YO
1469 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1470 "%*sCompiling trie using list compiler\n",
1471 (int)depth * 2 + 2, ""));
446bd890 1472
c944940b
JH
1473 trie->states = (reg_trie_state *)
1474 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1475 sizeof(reg_trie_state) );
a3621e74
YO
1476 TRIE_LIST_NEW(1);
1477 next_alloc = 2;
1478
1479 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1480
c445ea15
AL
1481 regnode * const noper = NEXTOPER( cur );
1482 U8 *uc = (U8*)STRING( noper );
1483 const U8 * const e = uc + STR_LEN( noper );
1484 U32 state = 1; /* required init */
1485 U16 charid = 0; /* sanity init */
1486 U8 *scan = (U8*)NULL; /* sanity init */
1487 STRLEN foldlen = 0; /* required init */
07be1b83 1488 U32 wordlen = 0; /* required init */
c445ea15
AL
1489 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1490
3dab1dad 1491 if (OP(noper) != NOTHING) {
786e8c11 1492 for ( ; uc < e ; uc += len ) {
c445ea15 1493
786e8c11 1494 TRIE_READ_CHAR;
c445ea15 1495
786e8c11
YO
1496 if ( uvc < 256 ) {
1497 charid = trie->charmap[ uvc ];
c445ea15 1498 } else {
55eed653 1499 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1500 if ( !svpp ) {
1501 charid = 0;
1502 } else {
1503 charid=(U16)SvIV( *svpp );
1504 }
c445ea15 1505 }
786e8c11
YO
1506 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1507 if ( charid ) {
a3621e74 1508
786e8c11
YO
1509 U16 check;
1510 U32 newstate = 0;
a3621e74 1511
786e8c11
YO
1512 charid--;
1513 if ( !trie->states[ state ].trans.list ) {
1514 TRIE_LIST_NEW( state );
c445ea15 1515 }
786e8c11
YO
1516 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1517 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1518 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1519 break;
1520 }
1521 }
1522 if ( ! newstate ) {
1523 newstate = next_alloc++;
1524 TRIE_LIST_PUSH( state, charid, newstate );
1525 transcount++;
1526 }
1527 state = newstate;
1528 } else {
1529 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1530 }
a28509cc 1531 }
c445ea15 1532 }
3dab1dad 1533 TRIE_HANDLE_WORD(state);
a3621e74
YO
1534
1535 } /* end second pass */
1536
1e2e3d02
YO
1537 /* next alloc is the NEXT state to be allocated */
1538 trie->statecount = next_alloc;
c944940b
JH
1539 trie->states = (reg_trie_state *)
1540 PerlMemShared_realloc( trie->states,
1541 next_alloc
1542 * sizeof(reg_trie_state) );
a3621e74 1543
3dab1dad 1544 /* and now dump it out before we compress it */
2b8b4781
NC
1545 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1546 revcharmap, next_alloc,
1547 depth+1)
1e2e3d02 1548 );
a3621e74 1549
c944940b
JH
1550 trie->trans = (reg_trie_trans *)
1551 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1552 {
1553 U32 state;
a3621e74
YO
1554 U32 tp = 0;
1555 U32 zp = 0;
1556
1557
1558 for( state=1 ; state < next_alloc ; state ++ ) {
1559 U32 base=0;
1560
1561 /*
1562 DEBUG_TRIE_COMPILE_MORE_r(
1563 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1564 );
1565 */
1566
1567 if (trie->states[state].trans.list) {
1568 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1569 U16 maxid=minid;
a28509cc 1570 U16 idx;
a3621e74
YO
1571
1572 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1573 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1574 if ( forid < minid ) {
1575 minid=forid;
1576 } else if ( forid > maxid ) {
1577 maxid=forid;
1578 }
a3621e74
YO
1579 }
1580 if ( transcount < tp + maxid - minid + 1) {
1581 transcount *= 2;
c944940b
JH
1582 trie->trans = (reg_trie_trans *)
1583 PerlMemShared_realloc( trie->trans,
446bd890
NC
1584 transcount
1585 * sizeof(reg_trie_trans) );
a3621e74
YO
1586 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1587 }
1588 base = trie->uniquecharcount + tp - minid;
1589 if ( maxid == minid ) {
1590 U32 set = 0;
1591 for ( ; zp < tp ; zp++ ) {
1592 if ( ! trie->trans[ zp ].next ) {
1593 base = trie->uniquecharcount + zp - minid;
1594 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1595 trie->trans[ zp ].check = state;
1596 set = 1;
1597 break;
1598 }
1599 }
1600 if ( !set ) {
1601 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1602 trie->trans[ tp ].check = state;
1603 tp++;
1604 zp = tp;
1605 }
1606 } else {
1607 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1608 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1609 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1610 trie->trans[ tid ].check = state;
1611 }
1612 tp += ( maxid - minid + 1 );
1613 }
1614 Safefree(trie->states[ state ].trans.list);
1615 }
1616 /*
1617 DEBUG_TRIE_COMPILE_MORE_r(
1618 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1619 );
1620 */
1621 trie->states[ state ].trans.base=base;
1622 }
cc601c31 1623 trie->lasttrans = tp + 1;
a3621e74
YO
1624 }
1625 } else {
1626 /*
1627 Second Pass -- Flat Table Representation.
1628
1629 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1630 We know that we will need Charcount+1 trans at most to store the data
1631 (one row per char at worst case) So we preallocate both structures
1632 assuming worst case.
1633
1634 We then construct the trie using only the .next slots of the entry
1635 structs.
1636
1637 We use the .check field of the first entry of the node temporarily to
1638 make compression both faster and easier by keeping track of how many non
1639 zero fields are in the node.
1640
1641 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1642 transition.
1643
1644 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1645 number representing the first entry of the node, and state as a
1646 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1647 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1648 are 2 entrys per node. eg:
1649
1650 A B A B
1651 1. 2 4 1. 3 7
1652 2. 0 3 3. 0 5
1653 3. 0 0 5. 0 0
1654 4. 0 0 7. 0 0
1655
1656 The table is internally in the right hand, idx form. However as we also
1657 have to deal with the states array which is indexed by nodenum we have to
1658 use TRIE_NODENUM() to convert.
1659
1660 */
1e2e3d02
YO
1661 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1662 "%*sCompiling trie using table compiler\n",
1663 (int)depth * 2 + 2, ""));
3dab1dad 1664
c944940b
JH
1665 trie->trans = (reg_trie_trans *)
1666 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1667 * trie->uniquecharcount + 1,
1668 sizeof(reg_trie_trans) );
1669 trie->states = (reg_trie_state *)
1670 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1671 sizeof(reg_trie_state) );
a3621e74
YO
1672 next_alloc = trie->uniquecharcount + 1;
1673
3dab1dad 1674
a3621e74
YO
1675 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1676
c445ea15 1677 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1678 const U8 *uc = (U8*)STRING( noper );
1679 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1680
1681 U32 state = 1; /* required init */
1682
1683 U16 charid = 0; /* sanity init */
1684 U32 accept_state = 0; /* sanity init */
1685 U8 *scan = (U8*)NULL; /* sanity init */
1686
1687 STRLEN foldlen = 0; /* required init */
07be1b83 1688 U32 wordlen = 0; /* required init */
a3621e74
YO
1689 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1690
3dab1dad 1691 if ( OP(noper) != NOTHING ) {
786e8c11 1692 for ( ; uc < e ; uc += len ) {
a3621e74 1693
786e8c11 1694 TRIE_READ_CHAR;
a3621e74 1695
786e8c11
YO
1696 if ( uvc < 256 ) {
1697 charid = trie->charmap[ uvc ];
1698 } else {
55eed653 1699 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1700 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1701 }
786e8c11
YO
1702 if ( charid ) {
1703 charid--;
1704 if ( !trie->trans[ state + charid ].next ) {
1705 trie->trans[ state + charid ].next = next_alloc;
1706 trie->trans[ state ].check++;
1707 next_alloc += trie->uniquecharcount;
1708 }
1709 state = trie->trans[ state + charid ].next;
1710 } else {
1711 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1712 }
1713 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1714 }
a3621e74 1715 }
3dab1dad
YO
1716 accept_state = TRIE_NODENUM( state );
1717 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1718
1719 } /* end second pass */
1720
3dab1dad 1721 /* and now dump it out before we compress it */
2b8b4781
NC
1722 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1723 revcharmap,
1724 next_alloc, depth+1));
a3621e74 1725
a3621e74
YO
1726 {
1727 /*
1728 * Inplace compress the table.*
1729
1730 For sparse data sets the table constructed by the trie algorithm will
1731 be mostly 0/FAIL transitions or to put it another way mostly empty.
1732 (Note that leaf nodes will not contain any transitions.)
1733
1734 This algorithm compresses the tables by eliminating most such
1735 transitions, at the cost of a modest bit of extra work during lookup:
1736
1737 - Each states[] entry contains a .base field which indicates the
1738 index in the state[] array wheres its transition data is stored.
1739
1740 - If .base is 0 there are no valid transitions from that node.
1741
1742 - If .base is nonzero then charid is added to it to find an entry in
1743 the trans array.
1744
1745 -If trans[states[state].base+charid].check!=state then the
1746 transition is taken to be a 0/Fail transition. Thus if there are fail
1747 transitions at the front of the node then the .base offset will point
1748 somewhere inside the previous nodes data (or maybe even into a node
1749 even earlier), but the .check field determines if the transition is
1750 valid.
1751
786e8c11 1752 XXX - wrong maybe?
a3621e74
YO
1753 The following process inplace converts the table to the compressed
1754 table: We first do not compress the root node 1,and mark its all its
1755 .check pointers as 1 and set its .base pointer as 1 as well. This
1756 allows to do a DFA construction from the compressed table later, and
1757 ensures that any .base pointers we calculate later are greater than
1758 0.
1759
1760 - We set 'pos' to indicate the first entry of the second node.
1761
1762 - We then iterate over the columns of the node, finding the first and
1763 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1764 and set the .check pointers accordingly, and advance pos
1765 appropriately and repreat for the next node. Note that when we copy
1766 the next pointers we have to convert them from the original
1767 NODEIDX form to NODENUM form as the former is not valid post
1768 compression.
1769
1770 - If a node has no transitions used we mark its base as 0 and do not
1771 advance the pos pointer.
1772
1773 - If a node only has one transition we use a second pointer into the
1774 structure to fill in allocated fail transitions from other states.
1775 This pointer is independent of the main pointer and scans forward
1776 looking for null transitions that are allocated to a state. When it
1777 finds one it writes the single transition into the "hole". If the
786e8c11 1778 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1779
1780 - Once compressed we can Renew/realloc the structures to release the
1781 excess space.
1782
1783 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1784 specifically Fig 3.47 and the associated pseudocode.
1785
1786 demq
1787 */
a3b680e6 1788 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1789 U32 state, charid;
a3621e74 1790 U32 pos = 0, zp=0;
1e2e3d02 1791 trie->statecount = laststate;
a3621e74
YO
1792
1793 for ( state = 1 ; state < laststate ; state++ ) {
1794 U8 flag = 0;
a28509cc
AL
1795 const U32 stateidx = TRIE_NODEIDX( state );
1796 const U32 o_used = trie->trans[ stateidx ].check;
1797 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1798 trie->trans[ stateidx ].check = 0;
1799
1800 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1801 if ( flag || trie->trans[ stateidx + charid ].next ) {
1802 if ( trie->trans[ stateidx + charid ].next ) {
1803 if (o_used == 1) {
1804 for ( ; zp < pos ; zp++ ) {
1805 if ( ! trie->trans[ zp ].next ) {
1806 break;
1807 }
1808 }
1809 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1810 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1811 trie->trans[ zp ].check = state;
1812 if ( ++zp > pos ) pos = zp;
1813 break;
1814 }
1815 used--;
1816 }
1817 if ( !flag ) {
1818 flag = 1;
1819 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1820 }
1821 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1822 trie->trans[ pos ].check = state;
1823 pos++;
1824 }
1825 }
1826 }
cc601c31 1827 trie->lasttrans = pos + 1;
c944940b
JH
1828 trie->states = (reg_trie_state *)
1829 PerlMemShared_realloc( trie->states, laststate
1830 * sizeof(reg_trie_state) );
a3621e74 1831 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1832 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1833 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1834 (int)depth * 2 + 2,"",
1835 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1836 (IV)next_alloc,
1837 (IV)pos,
a3621e74
YO
1838 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1839 );
1840
1841 } /* end table compress */
1842 }
1e2e3d02
YO
1843 DEBUG_TRIE_COMPILE_MORE_r(
1844 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1845 (int)depth * 2 + 2, "",
1846 (UV)trie->statecount,
1847 (UV)trie->lasttrans)
1848 );
cc601c31 1849 /* resize the trans array to remove unused space */
c944940b
JH
1850 trie->trans = (reg_trie_trans *)
1851 PerlMemShared_realloc( trie->trans, trie->lasttrans
1852 * sizeof(reg_trie_trans) );
a3621e74 1853
3dab1dad 1854 /* and now dump out the compressed format */
2b8b4781 1855 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1856
3dab1dad 1857 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1858 U8 nodetype =(U8)(flags & 0xFF);
1859 char *str=NULL;
786e8c11 1860
07be1b83 1861#ifdef DEBUGGING
e62cc96a 1862 regnode *optimize = NULL;
7122b237
YO
1863#ifdef RE_TRACK_PATTERN_OFFSETS
1864
b57a0404
JH
1865 U32 mjd_offset = 0;
1866 U32 mjd_nodelen = 0;
7122b237
YO
1867#endif /* RE_TRACK_PATTERN_OFFSETS */
1868#endif /* DEBUGGING */
a3621e74 1869 /*
3dab1dad
YO
1870 This means we convert either the first branch or the first Exact,
1871 depending on whether the thing following (in 'last') is a branch
1872 or not and whther first is the startbranch (ie is it a sub part of
1873 the alternation or is it the whole thing.)
1874 Assuming its a sub part we conver the EXACT otherwise we convert
1875 the whole branch sequence, including the first.
a3621e74 1876 */
3dab1dad 1877 /* Find the node we are going to overwrite */
7f69552c 1878 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1879 /* branch sub-chain */
3dab1dad 1880 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1881#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1882 DEBUG_r({
1883 mjd_offset= Node_Offset((convert));
1884 mjd_nodelen= Node_Length((convert));
1885 });
7122b237 1886#endif
7f69552c 1887 /* whole branch chain */
7122b237
YO
1888 }
1889#ifdef RE_TRACK_PATTERN_OFFSETS
1890 else {
7f69552c
YO
1891 DEBUG_r({
1892 const regnode *nop = NEXTOPER( convert );
1893 mjd_offset= Node_Offset((nop));
1894 mjd_nodelen= Node_Length((nop));
1895 });
07be1b83
YO
1896 }
1897 DEBUG_OPTIMISE_r(
1898 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1899 (int)depth * 2 + 2, "",
786e8c11 1900 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1901 );
7122b237 1902#endif
3dab1dad
YO
1903 /* But first we check to see if there is a common prefix we can
1904 split out as an EXACT and put in front of the TRIE node. */
1905 trie->startstate= 1;
55eed653 1906 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1907 U32 state;
1e2e3d02 1908 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1909 U32 ofs = 0;
8e11feef
RGS
1910 I32 idx = -1;
1911 U32 count = 0;
1912 const U32 base = trie->states[ state ].trans.base;
a3621e74 1913
3dab1dad 1914 if ( trie->states[state].wordnum )
8e11feef 1915 count = 1;
a3621e74 1916
8e11feef 1917 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1918 if ( ( base + ofs >= trie->uniquecharcount ) &&
1919 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1920 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1921 {
3dab1dad 1922 if ( ++count > 1 ) {
2b8b4781 1923 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1924 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1925 if ( state == 1 ) break;
3dab1dad
YO
1926 if ( count == 2 ) {
1927 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1928 DEBUG_OPTIMISE_r(
8e11feef
RGS
1929 PerlIO_printf(Perl_debug_log,
1930 "%*sNew Start State=%"UVuf" Class: [",
1931 (int)depth * 2 + 2, "",
786e8c11 1932 (UV)state));
be8e71aa 1933 if (idx >= 0) {
2b8b4781 1934 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1935 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1936
3dab1dad 1937 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1938 if ( folder )
1939 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1940 DEBUG_OPTIMISE_r(
07be1b83 1941 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1942 );
8e11feef
RGS
1943 }
1944 }
1945 TRIE_BITMAP_SET(trie,*ch);
1946 if ( folder )
1947 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1948 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1949 }
1950 idx = ofs;
1951 }
3dab1dad
YO
1952 }
1953 if ( count == 1 ) {
2b8b4781 1954 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1955 char *ch = SvPV_nolen( *tmp );
1956 DEBUG_OPTIMISE_r({
1957 SV *sv=sv_newmortal();
8e11feef
RGS
1958 PerlIO_printf( Perl_debug_log,
1959 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1960 (int)depth * 2 + 2, "",
de734bd5
A
1961 (UV)state, (UV)idx,
1962 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1963 PL_colors[0], PL_colors[1],
1964 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1965 PERL_PV_ESCAPE_FIRSTCHAR
1966 )
1967 );
1968 });
3dab1dad
YO
1969 if ( state==1 ) {
1970 OP( convert ) = nodetype;
1971 str=STRING(convert);
1972 STR_LEN(convert)=0;
1973 }
de734bd5
A
1974 while (*ch) {
1975 *str++ = *ch++;
1976 STR_LEN(convert)++;
1977 }
1978
8e11feef 1979 } else {
f9049ba1 1980#ifdef DEBUGGING
8e11feef
RGS
1981 if (state>1)
1982 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1983#endif
8e11feef
RGS
1984 break;
1985 }
1986 }
3dab1dad 1987 if (str) {
8e11feef 1988 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1989 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1990 trie->startstate = state;
07be1b83
YO
1991 trie->minlen -= (state - 1);
1992 trie->maxlen -= (state - 1);
1993 DEBUG_r({
1994 regnode *fix = convert;
de734bd5 1995 U32 word = trie->wordcount;
07be1b83
YO
1996 mjd_nodelen++;
1997 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1998 while( ++fix < n ) {
1999 Set_Node_Offset_Length(fix, 0, 0);
2000 }
de734bd5 2001 while (word--) {
2b8b4781 2002 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
2003 if (tmp) {
2004 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2005 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2006 else
2007 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2008 }
2009 }
07be1b83 2010 });
8e11feef
RGS
2011 if (trie->maxlen) {
2012 convert = n;
2013 } else {
3dab1dad 2014 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2015 DEBUG_r(optimize= n);
3dab1dad
YO
2016 }
2017 }
2018 }
a5ca303d
YO
2019 if (!jumper)
2020 jumper = last;
3dab1dad 2021 if ( trie->maxlen ) {
8e11feef
RGS
2022 NEXT_OFF( convert ) = (U16)(tail - convert);
2023 ARG_SET( convert, data_slot );
786e8c11
YO
2024 /* Store the offset to the first unabsorbed branch in
2025 jump[0], which is otherwise unused by the jump logic.
2026 We use this when dumping a trie and during optimisation. */
2027 if (trie->jump)
7f69552c 2028 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2029
786e8c11
YO
2030 /* XXXX */
2031 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2032 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2033 {
2034 OP( convert ) = TRIEC;
2035 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2036 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2037 trie->bitmap= NULL;
2038 } else
2039 OP( convert ) = TRIE;
a3621e74 2040
3dab1dad
YO
2041 /* store the type in the flags */
2042 convert->flags = nodetype;
a5ca303d
YO
2043 DEBUG_r({
2044 optimize = convert
2045 + NODE_STEP_REGNODE
2046 + regarglen[ OP( convert ) ];
2047 });
2048 /* XXX We really should free up the resource in trie now,
2049 as we won't use them - (which resources?) dmq */
3dab1dad 2050 }
a3621e74 2051 /* needed for dumping*/
e62cc96a 2052 DEBUG_r(if (optimize) {
07be1b83 2053 regnode *opt = convert;
bcdf7404 2054
e62cc96a 2055 while ( ++opt < optimize) {
07be1b83
YO
2056 Set_Node_Offset_Length(opt,0,0);
2057 }
786e8c11
YO
2058 /*
2059 Try to clean up some of the debris left after the
2060 optimisation.
a3621e74 2061 */
786e8c11 2062 while( optimize < jumper ) {
07be1b83 2063 mjd_nodelen += Node_Length((optimize));
a3621e74 2064 OP( optimize ) = OPTIMIZED;
07be1b83 2065 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2066 optimize++;
2067 }
07be1b83 2068 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2069 });
2070 } /* end node insert */
55eed653 2071 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2072#ifdef DEBUGGING
2073 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2074 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2075#else
2076 SvREFCNT_dec(revcharmap);
07be1b83 2077#endif
786e8c11
YO
2078 return trie->jump
2079 ? MADE_JUMP_TRIE
2080 : trie->startstate>1
2081 ? MADE_EXACT_TRIE
2082 : MADE_TRIE;
2083}
2084
2085STATIC void
2086S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2087{
2088/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2089
2090 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2091 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2092 ISBN 0-201-10088-6
2093
2094 We find the fail state for each state in the trie, this state is the longest proper
2095 suffix of the current states 'word' that is also a proper prefix of another word in our
2096 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2097 the DFA not to have to restart after its tried and failed a word at a given point, it
2098 simply continues as though it had been matching the other word in the first place.
2099 Consider
2100 'abcdgu'=~/abcdefg|cdgu/
2101 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2102 fail, which would bring use to the state representing 'd' in the second word where we would
2103 try 'g' and succeed, prodceding to match 'cdgu'.
2104 */
2105 /* add a fail transition */
3251b653
NC
2106 const U32 trie_offset = ARG(source);
2107 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2108 U32 *q;
2109 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2110 const U32 numstates = trie->statecount;
786e8c11
YO
2111 const U32 ubound = trie->lasttrans + ucharcount;
2112 U32 q_read = 0;
2113 U32 q_write = 0;
2114 U32 charid;
2115 U32 base = trie->states[ 1 ].trans.base;
2116 U32 *fail;
2117 reg_ac_data *aho;
2118 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2119 GET_RE_DEBUG_FLAGS_DECL;
2120#ifndef DEBUGGING
2121 PERL_UNUSED_ARG(depth);
2122#endif
2123
2124
2125 ARG_SET( stclass, data_slot );
c944940b 2126 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2127 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2128 aho->trie=trie_offset;
446bd890
NC
2129 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2130 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2131 Newxz( q, numstates, U32);
c944940b 2132 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2133 aho->refcount = 1;
2134 fail = aho->fail;
2135 /* initialize fail[0..1] to be 1 so that we always have
2136 a valid final fail state */
2137 fail[ 0 ] = fail[ 1 ] = 1;
2138
2139 for ( charid = 0; charid < ucharcount ; charid++ ) {
2140 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2141 if ( newstate ) {
2142 q[ q_write ] = newstate;
2143 /* set to point at the root */
2144 fail[ q[ q_write++ ] ]=1;
2145 }
2146 }
2147 while ( q_read < q_write) {
2148 const U32 cur = q[ q_read++ % numstates ];
2149 base = trie->states[ cur ].trans.base;
2150
2151 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2152 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2153 if (ch_state) {
2154 U32 fail_state = cur;
2155 U32 fail_base;
2156 do {
2157 fail_state = fail[ fail_state ];
2158 fail_base = aho->states[ fail_state ].trans.base;
2159 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2160
2161 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2162 fail[ ch_state ] = fail_state;
2163 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2164 {
2165 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2166 }
2167 q[ q_write++ % numstates] = ch_state;
2168 }
2169 }
2170 }
2171 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2172 when we fail in state 1, this allows us to use the
2173 charclass scan to find a valid start char. This is based on the principle
2174 that theres a good chance the string being searched contains lots of stuff
2175 that cant be a start char.
2176 */
2177 fail[ 0 ] = fail[ 1 ] = 0;
2178 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2179 PerlIO_printf(Perl_debug_log,
2180 "%*sStclass Failtable (%"UVuf" states): 0",
2181 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2182 );
786e8c11
YO
2183 for( q_read=1; q_read<numstates; q_read++ ) {
2184 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2185 }
2186 PerlIO_printf(Perl_debug_log, "\n");
2187 });
2188 Safefree(q);
2189 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2190}
2191
786e8c11 2192
a3621e74 2193/*
5d1c421c
JH
2194 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2195 * These need to be revisited when a newer toolchain becomes available.
2196 */
2197#if defined(__sparc64__) && defined(__GNUC__)
2198# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2199# undef SPARC64_GCC_WORKAROUND
2200# define SPARC64_GCC_WORKAROUND 1
2201# endif
2202#endif
2203
07be1b83 2204#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2205 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2206 SV * const mysv=sv_newmortal(); \
2207 regnode *Next = regnext(scan); \
2208 regprop(RExC_rx, mysv, scan); \
7f69552c 2209 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2210 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2211 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2212 }});
07be1b83 2213
1de06328
YO
2214
2215
2216
2217
07be1b83
YO
2218#define JOIN_EXACT(scan,min,flags) \
2219 if (PL_regkind[OP(scan)] == EXACT) \
2220 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2221
be8e71aa 2222STATIC U32
07be1b83
YO
2223S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2224 /* Merge several consecutive EXACTish nodes into one. */
2225 regnode *n = regnext(scan);
2226 U32 stringok = 1;
2227 regnode *next = scan + NODE_SZ_STR(scan);
2228 U32 merged = 0;
2229 U32 stopnow = 0;
2230#ifdef DEBUGGING
2231 regnode *stop = scan;
72f13be8 2232 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2233#else
d47053eb
RGS
2234 PERL_UNUSED_ARG(depth);
2235#endif
2236#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2237 PERL_UNUSED_ARG(flags);
2238 PERL_UNUSED_ARG(val);
07be1b83 2239#endif
07be1b83
YO
2240 DEBUG_PEEP("join",scan,depth);
2241
2242 /* Skip NOTHING, merge EXACT*. */
2243 while (n &&
2244 ( PL_regkind[OP(n)] == NOTHING ||
2245 (stringok && (OP(n) == OP(scan))))
2246 && NEXT_OFF(n)
2247 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2248
2249 if (OP(n) == TAIL || n > next)
2250 stringok = 0;
2251 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2252 DEBUG_PEEP("skip:",n,depth);
2253 NEXT_OFF(scan) += NEXT_OFF(n);
2254 next = n + NODE_STEP_REGNODE;
2255#ifdef DEBUGGING
2256 if (stringok)
2257 stop = n;
2258#endif
2259 n = regnext(n);
2260 }
2261 else if (stringok) {
786e8c11 2262 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2263 regnode * const nnext = regnext(n);
2264
2265 DEBUG_PEEP("merg",n,depth);
2266
2267 merged++;
2268 if (oldl + STR_LEN(n) > U8_MAX)
2269 break;
2270 NEXT_OFF(scan) += NEXT_OFF(n);
2271 STR_LEN(scan) += STR_LEN(n);
2272 next = n + NODE_SZ_STR(n);
2273 /* Now we can overwrite *n : */
2274 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2275#ifdef DEBUGGING
2276 stop = next - 1;
2277#endif
2278 n = nnext;
2279 if (stopnow) break;
2280 }
2281
d47053eb
RGS
2282#ifdef EXPERIMENTAL_INPLACESCAN
2283 if (flags && !NEXT_OFF(n)) {
2284 DEBUG_PEEP("atch", val, depth);
2285 if (reg_off_by_arg[OP(n)]) {
2286 ARG_SET(n, val - n);
2287 }
2288 else {
2289 NEXT_OFF(n) = val - n;
2290 }
2291 stopnow = 1;
2292 }
07be1b83
YO
2293#endif
2294 }
2295
2296 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2297 /*
2298 Two problematic code points in Unicode casefolding of EXACT nodes:
2299
2300 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2301 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2302
2303 which casefold to
2304
2305 Unicode UTF-8
2306
2307 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2308 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2309
2310 This means that in case-insensitive matching (or "loose matching",
2311 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2312 length of the above casefolded versions) can match a target string
2313 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2314 This would rather mess up the minimum length computation.
2315
2316 What we'll do is to look for the tail four bytes, and then peek
2317 at the preceding two bytes to see whether we need to decrease
2318 the minimum length by four (six minus two).
2319
2320 Thanks to the design of UTF-8, there cannot be false matches:
2321 A sequence of valid UTF-8 bytes cannot be a subsequence of
2322 another valid sequence of UTF-8 bytes.
2323
2324 */
2325 char * const s0 = STRING(scan), *s, *t;
2326 char * const s1 = s0 + STR_LEN(scan) - 1;
2327 char * const s2 = s1 - 4;
e294cc5d
JH
2328#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2329 const char t0[] = "\xaf\x49\xaf\x42";
2330#else
07be1b83 2331 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2332#endif
07be1b83
YO
2333 const char * const t1 = t0 + 3;
2334
2335 for (s = s0 + 2;
2336 s < s2 && (t = ninstr(s, s1, t0, t1));
2337 s = t + 4) {
e294cc5d
JH
2338#ifdef EBCDIC
2339 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2340 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2341#else
07be1b83
YO
2342 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2343 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2344#endif
07be1b83
YO
2345 *min -= 4;
2346 }
2347 }
2348
2349#ifdef DEBUGGING
2350 /* Allow dumping */
2351 n = scan + NODE_SZ_STR(scan);
2352 while (n <= stop) {
2353 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2354 OP(n) = OPTIMIZED;
2355 NEXT_OFF(n) = 0;
2356 }
2357 n++;
2358 }
2359#endif
2360 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2361 return stopnow;
2362}
2363
653099ff
GS
2364/* REx optimizer. Converts nodes into quickier variants "in place".
2365 Finds fixed substrings. */
2366
a0288114 2367/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2368 to the position after last scanned or to NULL. */
2369
40d049e4
YO
2370#define INIT_AND_WITHP \
2371 assert(!and_withp); \
2372 Newx(and_withp,1,struct regnode_charclass_class); \
2373 SAVEFREEPV(and_withp)
07be1b83 2374
b515a41d
YO
2375/* this is a chain of data about sub patterns we are processing that
2376 need to be handled seperately/specially in study_chunk. Its so
2377 we can simulate recursion without losing state. */
2378struct scan_frame;
2379typedef struct scan_frame {
2380 regnode *last; /* last node to process in this frame */
2381 regnode *next; /* next node to process when last is reached */
2382 struct scan_frame *prev; /*previous frame*/
2383 I32 stop; /* what stopparen do we use */
2384} scan_frame;
2385
304ee84b
YO
2386
2387#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2388
76e3520e 2389STATIC I32
40d049e4 2390S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2391 I32 *minlenp, I32 *deltap,
40d049e4
YO
2392 regnode *last,
2393 scan_data_t *data,
2394 I32 stopparen,
2395 U8* recursed,
2396 struct regnode_charclass_class *and_withp,
2397 U32 flags, U32 depth)
c277df42
IZ
2398 /* scanp: Start here (read-write). */
2399 /* deltap: Write maxlen-minlen here. */
2400 /* last: Stop before this one. */
40d049e4
YO
2401 /* data: string data about the pattern */
2402 /* stopparen: treat close N as END */
2403 /* recursed: which subroutines have we recursed into */
2404 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2405{
97aff369 2406 dVAR;
c277df42
IZ
2407 I32 min = 0, pars = 0, code;
2408 regnode *scan = *scanp, *next;
2409 I32 delta = 0;
2410 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2411 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2412 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2413 scan_data_t data_fake;
a3621e74 2414 SV *re_trie_maxbuff = NULL;
786e8c11 2415 regnode *first_non_open = scan;
e2e6a0f1 2416 I32 stopmin = I32_MAX;
8aa23a47
YO
2417 scan_frame *frame = NULL;
2418
a3621e74 2419 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2420
13a24bad 2421#ifdef DEBUGGING
40d049e4 2422 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2423#endif
40d049e4 2424
786e8c11 2425 if ( depth == 0 ) {
40d049e4 2426 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2427 first_non_open=regnext(first_non_open);
2428 }
2429
b81d288d 2430
8aa23a47
YO
2431 fake_study_recurse:
2432 while ( scan && OP(scan) != END && scan < last ){
2433 /* Peephole optimizer: */
304ee84b 2434 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2435 DEBUG_PEEP("Peep",scan,depth);
2436 JOIN_EXACT(scan,&min,0);
2437
2438 /* Follow the next-chain of the current node and optimize
2439 away all the NOTHINGs from it. */
2440 if (OP(scan) != CURLYX) {
2441 const int max = (reg_off_by_arg[OP(scan)]
2442 ? I32_MAX
2443 /* I32 may be smaller than U16 on CRAYs! */
2444 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2445 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2446 int noff;
2447 regnode *n = scan;
2448
2449 /* Skip NOTHING and LONGJMP. */
2450 while ((n = regnext(n))
2451 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2452 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2453 && off + noff < max)
2454 off += noff;
2455 if (reg_off_by_arg[OP(scan)])
2456 ARG(scan) = off;
2457 else
2458 NEXT_OFF(scan) = off;
2459 }
a3621e74 2460
c277df42 2461
8aa23a47
YO
2462
2463 /* The principal pseudo-switch. Cannot be a switch, since we
2464 look into several different things. */
2465 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2466 || OP(scan) == IFTHEN) {
2467 next = regnext(scan);
2468 code = OP(scan);
2469 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2470
2471 if (OP(next) == code || code == IFTHEN) {
2472 /* NOTE - There is similar code to this block below for handling
2473 TRIE nodes on a re-study. If you change stuff here check there
2474 too. */
2475 I32 max1 = 0, min1 = I32_MAX, num = 0;
2476 struct regnode_charclass_class accum;
2477 regnode * const startbranch=scan;
2478
2479 if (flags & SCF_DO_SUBSTR)
304ee84b 2480 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2481 if (flags & SCF_DO_STCLASS)
2482 cl_init_zero(pRExC_state, &accum);
2483
2484 while (OP(scan) == code) {
2485 I32 deltanext, minnext, f = 0, fake;
2486 struct regnode_charclass_class this_class;
2487
2488 num++;
2489 data_fake.flags = 0;
2490 if (data) {
2491 data_fake.whilem_c = data->whilem_c;
2492 data_fake.last_closep = data->last_closep;
2493 }
2494 else
2495 data_fake.last_closep = &fake;
58e23c8d
YO
2496
2497 data_fake.pos_delta = delta;
8aa23a47
YO
2498 next = regnext(scan);
2499 scan = NEXTOPER(scan);
2500 if (code != BRANCH)
c277df42 2501 scan = NEXTOPER(scan);
8aa23a47
YO
2502 if (flags & SCF_DO_STCLASS) {
2503 cl_init(pRExC_state, &this_class);
2504 data_fake.start_class = &this_class;
2505 f = SCF_DO_STCLASS_AND;
58e23c8d 2506 }
8aa23a47
YO
2507 if (flags & SCF_WHILEM_VISITED_POS)
2508 f |= SCF_WHILEM_VISITED_POS;
2509
2510 /* we suppose the run is continuous, last=next...*/
2511 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2512 next, &data_fake,
2513 stopparen, recursed, NULL, f,depth+1);
2514 if (min1 > minnext)
2515 min1 = minnext;
2516 if (max1 < minnext + deltanext)
2517 max1 = minnext + deltanext;
2518 if (deltanext == I32_MAX)
2519 is_inf = is_inf_internal = 1;
2520 scan = next;
2521 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2522 pars++;
2523 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2524 if ( stopmin > minnext)
2525 stopmin = min + min1;
2526 flags &= ~SCF_DO_SUBSTR;
2527 if (data)
2528 data->flags |= SCF_SEEN_ACCEPT;
2529 }
2530 if (data) {
2531 if (data_fake.flags & SF_HAS_EVAL)
2532 data->flags |= SF_HAS_EVAL;
2533 data->whilem_c = data_fake.whilem_c;
3dab1dad 2534 }
8aa23a47
YO
2535 if (flags & SCF_DO_STCLASS)
2536 cl_or(pRExC_state, &accum, &this_class);
2537 }
2538 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2539 min1 = 0;
2540 if (flags & SCF_DO_SUBSTR) {
2541 data->pos_min += min1;
2542 data->pos_delta += max1 - min1;
2543 if (max1 != min1 || is_inf)
2544 data->longest = &(data->longest_float);
2545 }
2546 min += min1;
2547 delta += max1 - min1;
2548 if (flags & SCF_DO_STCLASS_OR) {
2549 cl_or(pRExC_state, data->start_class, &accum);
2550 if (min1) {
2551 cl_and(data->start_class, and_withp);
2552 flags &= ~SCF_DO_STCLASS;
653099ff 2553 }
8aa23a47
YO
2554 }
2555 else if (flags & SCF_DO_STCLASS_AND) {
2556 if (min1) {
2557 cl_and(data->start_class, &accum);
2558 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2559 }
8aa23a47
YO
2560 else {
2561 /* Switch to OR mode: cache the old value of
2562 * data->start_class */
2563 INIT_AND_WITHP;
2564 StructCopy(data->start_class, and_withp,
2565 struct regnode_charclass_class);
2566 flags &= ~SCF_DO_STCLASS_AND;
2567 StructCopy(&accum, data->start_class,
2568 struct regnode_charclass_class);
2569 flags |= SCF_DO_STCLASS_OR;
2570 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2571 }
8aa23a47 2572 }
a3621e74 2573
8aa23a47
YO
2574 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2575 /* demq.
a3621e74 2576
8aa23a47
YO
2577 Assuming this was/is a branch we are dealing with: 'scan' now
2578 points at the item that follows the branch sequence, whatever
2579 it is. We now start at the beginning of the sequence and look
2580 for subsequences of
a3621e74 2581
8aa23a47
YO
2582 BRANCH->EXACT=>x1
2583 BRANCH->EXACT=>x2
2584 tail
a3621e74 2585
8aa23a47 2586 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2587
8aa23a47
YO
2588 If we can find such a subseqence we need to turn the first
2589 element into a trie and then add the subsequent branch exact
2590 strings to the trie.
a3621e74 2591
8aa23a47 2592 We have two cases
a3621e74 2593
8aa23a47 2594 1. patterns where the whole set of branch can be converted.
a3621e74 2595
8aa23a47 2596 2. patterns where only a subset can be converted.
a3621e74 2597
8aa23a47
YO
2598 In case 1 we can replace the whole set with a single regop
2599 for the trie. In case 2 we need to keep the start and end
2600 branchs so
a3621e74 2601
8aa23a47
YO
2602 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2603 becomes BRANCH TRIE; BRANCH X;
786e8c11 2604
8aa23a47
YO
2605 There is an additional case, that being where there is a
2606 common prefix, which gets split out into an EXACT like node
2607 preceding the TRIE node.
a3621e74 2608
8aa23a47
YO
2609 If x(1..n)==tail then we can do a simple trie, if not we make
2610 a "jump" trie, such that when we match the appropriate word
2611 we "jump" to the appopriate tail node. Essentailly we turn
2612 a nested if into a case structure of sorts.
b515a41d 2613
8aa23a47
YO
2614 */
2615
2616 int made=0;
2617 if (!re_trie_maxbuff) {
2618 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2619 if (!SvIOK(re_trie_maxbuff))
2620 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2621 }
2622 if ( SvIV(re_trie_maxbuff)>=0 ) {
2623 regnode *cur;
2624 regnode *first = (regnode *)NULL;
2625 regnode *last = (regnode *)NULL;
2626 regnode *tail = scan;
2627 U8 optype = 0;
2628 U32 count=0;
a3621e74
YO
2629
2630#ifdef DEBUGGING
8aa23a47 2631 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2632#endif
8aa23a47
YO
2633 /* var tail is used because there may be a TAIL
2634 regop in the way. Ie, the exacts will point to the
2635 thing following the TAIL, but the last branch will
2636 point at the TAIL. So we advance tail. If we
2637 have nested (?:) we may have to move through several
2638 tails.
2639 */
2640
2641 while ( OP( tail ) == TAIL ) {
2642 /* this is the TAIL generated by (?:) */
2643 tail = regnext( tail );
2644 }
a3621e74 2645
8aa23a47
YO
2646
2647 DEBUG_OPTIMISE_r({
2648 regprop(RExC_rx, mysv, tail );
2649 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2650 (int)depth * 2 + 2, "",
2651 "Looking for TRIE'able sequences. Tail node is: ",
2652 SvPV_nolen_const( mysv )
2653 );
2654 });
2655
2656 /*
2657
2658 step through the branches, cur represents each
2659 branch, noper is the first thing to be matched
2660 as part of that branch and noper_next is the
2661 regnext() of that node. if noper is an EXACT
2662 and noper_next is the same as scan (our current
2663 position in the regex) then the EXACT branch is
2664 a possible optimization target. Once we have
2665 two or more consequetive such branches we can
2666 create a trie of the EXACT's contents and stich
2667 it in place. If the sequence represents all of
2668 the branches we eliminate the whole thing and
2669 replace it with a single TRIE. If it is a
2670 subsequence then we need to stitch it in. This
2671 means the first branch has to remain, and needs
2672 to be repointed at the item on the branch chain
2673 following the last branch optimized. This could
2674 be either a BRANCH, in which case the
2675 subsequence is internal, or it could be the
2676 item following the branch sequence in which
2677 case the subsequence is at the end.
2678
2679 */
2680
2681 /* dont use tail as the end marker for this traverse */
2682 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2683 regnode * const noper = NEXTOPER( cur );
b515a41d 2684#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2685 regnode * const noper_next = regnext( noper );
b515a41d
YO
2686#endif
2687
8aa23a47
YO
2688 DEBUG_OPTIMISE_r({
2689 regprop(RExC_rx, mysv, cur);
2690 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2691 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2692
2693 regprop(RExC_rx, mysv, noper);
2694 PerlIO_printf( Perl_debug_log, " -> %s",
2695 SvPV_nolen_const(mysv));
2696
2697 if ( noper_next ) {
2698 regprop(RExC_rx, mysv, noper_next );
2699 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2700 SvPV_nolen_const(mysv));
2701 }
2702 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2703 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2704 });
2705 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2706 : PL_regkind[ OP( noper ) ] == EXACT )
2707 || OP(noper) == NOTHING )
786e8c11 2708#ifdef NOJUMPTRIE
8aa23a47 2709 && noper_next == tail
786e8c11 2710#endif
8aa23a47
YO
2711 && count < U16_MAX)
2712 {
2713 count++;
2714 if ( !first || optype == NOTHING ) {
2715 if (!first) first = cur;
2716 optype = OP( noper );
2717 } else {
2718 last = cur;
2719 }
2720 } else {
2721 if ( last ) {
2722 make_trie( pRExC_state,
2723 startbranch, first, cur, tail, count,
2724 optype, depth+1 );
2725 }
2726 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2727#ifdef NOJUMPTRIE
8aa23a47 2728 && noper_next == tail
786e8c11 2729#endif
8aa23a47
YO
2730 ){
2731 count = 1;
2732 first = cur;
2733 optype = OP( noper );
2734 } else {
2735 count = 0;
2736 first = NULL;
2737 optype = 0;
2738 }
2739 last = NULL;
2740 }
2741 }
2742 DEBUG_OPTIMISE_r({
2743 regprop(RExC_rx, mysv, cur);
2744 PerlIO_printf( Perl_debug_log,
2745 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2746 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2747
2748 });
2749 if ( last ) {
2750 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2751#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2752 if ( ((made == MADE_EXACT_TRIE &&
2753 startbranch == first)
2754 || ( first_non_open == first )) &&
2755 depth==0 ) {
2756 flags |= SCF_TRIE_RESTUDY;
2757 if ( startbranch == first
2758 && scan == tail )
2759 {
2760 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2761 }
2762 }
3dab1dad 2763#endif
8aa23a47
YO
2764 }
2765 }
2766
2767 } /* do trie */
2768
653099ff 2769 }
8aa23a47
YO
2770 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2771 scan = NEXTOPER(NEXTOPER(scan));
2772 } else /* single branch is optimized. */
2773 scan = NEXTOPER(scan);
2774 continue;
2775 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2776 scan_frame *newframe = NULL;
2777 I32 paren;
2778 regnode *start;
2779 regnode *end;
2780
2781 if (OP(scan) != SUSPEND) {
2782 /* set the pointer */
2783 if (OP(scan) == GOSUB) {
2784 paren = ARG(scan);
2785 RExC_recurse[ARG2L(scan)] = scan;
2786 start = RExC_open_parens[paren-1];
2787 end = RExC_close_parens[paren-1];
2788 } else {
2789 paren = 0;
f8fc2ecf 2790 start = RExC_rxi->program + 1;
8aa23a47
YO
2791 end = RExC_opend;
2792 }
2793 if (!recursed) {
2794 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2795 SAVEFREEPV(recursed);
2796 }
2797 if (!PAREN_TEST(recursed,paren+1)) {
2798 PAREN_SET(recursed,paren+1);
2799 Newx(newframe,1,scan_frame);
2800 } else {
2801 if (flags & SCF_DO_SUBSTR) {
304ee84b 2802 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2803 data->longest = &(data->longest_float);
2804 }
2805 is_inf = is_inf_internal = 1;
2806 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2807 cl_anything(pRExC_state, data->start_class);
2808 flags &= ~SCF_DO_STCLASS;
2809 }
2810 } else {
2811 Newx(newframe,1,scan_frame);
2812 paren = stopparen;
2813 start = scan+2;
2814 end = regnext(scan);
2815 }
2816 if (newframe) {
2817 assert(start);
2818 assert(end);
2819 SAVEFREEPV(newframe);
2820 newframe->next = regnext(scan);
2821 newframe->last = last;
2822 newframe->stop = stopparen;
2823 newframe->prev = frame;
2824
2825 frame = newframe;
2826 scan = start;
2827 stopparen = paren;
2828 last = end;
2829
2830 continue;
2831 }
2832 }
2833 else if (OP(scan) == EXACT) {
2834 I32 l = STR_LEN(scan);
2835 UV uc;
2836 if (UTF) {
2837 const U8 * const s = (U8*)STRING(scan);
2838 l = utf8_length(s, s + l);
2839 uc = utf8_to_uvchr(s, NULL);
2840 } else {
2841 uc = *((U8*)STRING(scan));
2842 }
2843 min += l;
2844 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2845 /* The code below prefers earlier match for fixed
2846 offset, later match for variable offset. */
2847 if (data->last_end == -1) { /* Update the start info. */
2848 data->last_start_min = data->pos_min;
2849 data->last_start_max = is_inf
2850 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2851 }
8aa23a47
YO
2852 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2853 if (UTF)
2854 SvUTF8_on(data->last_found);
2855 {
2856 SV * const sv = data->last_found;
2857 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2858 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2859 if (mg && mg->mg_len >= 0)
2860 mg->mg_len += utf8_length((U8*)STRING(scan),
2861 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2862 }
8aa23a47
YO
2863 data->last_end = data->pos_min + l;
2864 data->pos_min += l; /* As in the first entry. */
2865 data->flags &= ~SF_BEFORE_EOL;
2866 }
2867 if (flags & SCF_DO_STCLASS_AND) {
2868 /* Check whether it is compatible with what we know already! */
2869 int compat = 1;
2870
2871 if (uc >= 0x100 ||
2872 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2873 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2874 && (!(data->start_class->flags & ANYOF_FOLD)
2875 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2876 )
2877 compat = 0;
2878 ANYOF_CLASS_ZERO(data->start_class);
2879 ANYOF_BITMAP_ZERO(data->start_class);
2880 if (compat)
2881 ANYOF_BITMAP_SET(data->start_class, uc);
2882 data->start_class->flags &= ~ANYOF_EOS;
2883 if (uc < 0x100)
2884 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2885 }
2886 else if (flags & SCF_DO_STCLASS_OR) {
2887 /* false positive possible if the class is case-folded */
2888 if (uc < 0x100)
2889 ANYOF_BITMAP_SET(data->start_class, uc);
2890 else
2891 data->start_class->flags |= ANYOF_UNICODE_ALL;
2892 data->start_class->flags &= ~ANYOF_EOS;
2893 cl_and(data->start_class, and_withp);
2894 }
2895 flags &= ~SCF_DO_STCLASS;
2896 }
2897 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2898 I32 l = STR_LEN(scan);
2899 UV uc = *((U8*)STRING(scan));
2900
2901 /* Search for fixed substrings supports EXACT only. */
2902 if (flags & SCF_DO_SUBSTR) {
2903 assert(data);
304ee84b 2904 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2905 }
2906 if (UTF) {
2907 const U8 * const s = (U8 *)STRING(scan);
2908 l = utf8_length(s, s + l);
2909 uc = utf8_to_uvchr(s, NULL);
2910 }
2911 min += l;
2912 if (flags & SCF_DO_SUBSTR)
2913 data->pos_min += l;
2914 if (flags & SCF_DO_STCLASS_AND) {
2915 /* Check whether it is compatible with what we know already! */
2916 int compat = 1;
2917
2918 if (uc >= 0x100 ||
2919 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2920 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2921 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2922 compat = 0;
2923 ANYOF_CLASS_ZERO(data->start_class);
2924 ANYOF_BITMAP_ZERO(data->start_class);
2925 if (compat) {
2926 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2927 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2928 data->start_class->flags |= ANYOF_FOLD;
2929 if (OP(scan) == EXACTFL)
2930 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2931 }
8aa23a47
YO
2932 }
2933 else if (flags & SCF_DO_STCLASS_OR) {
2934 if (data->start_class->flags & ANYOF_FOLD) {
2935 /* false positive possible if the class is case-folded.
2936 Assume that the locale settings are the same... */
1aa99e6b
IH
2937 if (uc < 0x100)
2938 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2939 data->start_class->flags &= ~ANYOF_EOS;
2940 }
8aa23a47 2941 cl_and(data->start_class, and_withp);
653099ff 2942 }
8aa23a47
YO
2943 flags &= ~SCF_DO_STCLASS;
2944 }
2945 else if (strchr((const char*)PL_varies,OP(scan))) {
2946 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2947 I32 f = flags, pos_before = 0;
2948 regnode * const oscan = scan;
2949 struct regnode_charclass_class this_class;
2950 struct regnode_charclass_class *oclass = NULL;
2951 I32 next_is_eval = 0;
2952
2953 switch (PL_regkind[OP(scan)]) {
2954 case WHILEM: /* End of (?:...)* . */
2955 scan = NEXTOPER(scan);
2956 goto finish;
2957 case PLUS:
2958 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2959 next = NEXTOPER(scan);
2960 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2961 mincount = 1;
2962 maxcount = REG_INFTY;
2963 next = regnext(scan);
2964 scan = NEXTOPER(scan);
2965 goto do_curly;
2966 }
2967 }
2968 if (flags & SCF_DO_SUBSTR)
2969 data->pos_min++;
2970 min++;
2971 /* Fall through. */
2972 case STAR:
2973 if (flags & SCF_DO_STCLASS) {
2974 mincount = 0;
2975 maxcount = REG_INFTY;
2976 next = regnext(scan);
2977 scan = NEXTOPER(scan);
2978 goto do_curly;
2979 }
2980 is_inf = is_inf_internal = 1;
2981 scan = regnext(scan);
c277df42 2982 if (flags & SCF_DO_SUBSTR) {
304ee84b 2983 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2984 data->longest = &(data->longest_float);
c277df42 2985 }
8aa23a47
YO
2986 goto optimize_curly_tail;
2987 case CURLY:
2988 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2989 && (scan->flags == stopparen))
2990 {
2991 mincount = 1;
2992 maxcount = 1;
2993 } else {
2994 mincount = ARG1(scan);
2995 maxcount = ARG2(scan);
653099ff 2996 }
8aa23a47
YO
2997 next = regnext(scan);
2998 if (OP(scan) == CURLYX) {
2999 I32 lp = (data ? *(data->last_closep) : 0);
3000 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3001 }
8aa23a47
YO
3002 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3003 next_is_eval = (OP(scan) == EVAL);
3004 do_curly:
3005 if (flags & SCF_DO_SUBSTR) {
304ee84b 3006 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3007 pos_before = data->pos_min;
b45f050a 3008 }
8aa23a47
YO
3009 if (data) {
3010 fl = data->flags;
3011 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3012 if (is_inf)
3013 data->flags |= SF_IS_INF;
3014 }
3015 if (flags & SCF_DO_STCLASS) {
3016 cl_init(pRExC_state, &this_class);
3017 oclass = data->start_class;
3018 data->start_class = &this_class;
3019 f |= SCF_DO_STCLASS_AND;
3020 f &= ~SCF_DO_STCLASS_OR;
3021 }
3022 /* These are the cases when once a subexpression
3023 fails at a particular position, it cannot succeed
3024 even after backtracking at the enclosing scope.
3025
3026 XXXX what if minimal match and we are at the
3027 initial run of {n,m}? */
3028 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3029 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3030
8aa23a47
YO
3031 /* This will finish on WHILEM, setting scan, or on NULL: */
3032 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3033 last, data, stopparen, recursed, NULL,
3034 (mincount == 0
3035 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3036
8aa23a47
YO
3037 if (flags & SCF_DO_STCLASS)
3038 data->start_class = oclass;
3039 if (mincount == 0 || minnext == 0) {
3040 if (flags & SCF_DO_STCLASS_OR) {
3041 cl_or(pRExC_state, data->start_class, &this_class);
3042 }
3043 else if (flags & SCF_DO_STCLASS_AND) {
3044 /* Switch to OR mode: cache the old value of
3045 * data->start_class */
3046 INIT_AND_WITHP;
3047 StructCopy(data->start_class, and_withp,
3048 struct regnode_charclass_class);
3049 flags &= ~SCF_DO_STCLASS_AND;
3050 StructCopy(&this_class, data->start_class,
3051 struct regnode_charclass_class);
3052 flags |= SCF_DO_STCLASS_OR;
3053 data->start_class->flags |= ANYOF_EOS;
3054 }
3055 } else { /* Non-zero len */
3056 if (flags & SCF_DO_STCLASS_OR) {
3057 cl_or(pRExC_state, data->start_class, &this_class);
3058 cl_and(data->start_class, and_withp);
3059 }
3060 else if (flags & SCF_DO_STCLASS_AND)
3061 cl_and(data->start_class, &this_class);
3062 flags &= ~SCF_DO_STCLASS;
3063 }
3064 if (!scan) /* It was not CURLYX, but CURLY. */
3065 scan = next;
3066 if ( /* ? quantifier ok, except for (?{ ... }) */
3067 (next_is_eval || !(mincount == 0 && maxcount == 1))
3068 && (minnext == 0) && (deltanext == 0)
3069 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3070 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3071 && ckWARN(WARN_REGEXP))
3072 {
3073 vWARN(RExC_parse,
3074 "Quantifier unexpected on zero-length expression");
3075 }
3076
3077 min += minnext * mincount;
3078 is_inf_internal |= ((maxcount == REG_INFTY
3079 && (minnext + deltanext) > 0)
3080 || deltanext == I32_MAX);
3081 is_inf |= is_inf_internal;
3082 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3083
3084 /* Try powerful optimization CURLYX => CURLYN. */
3085 if ( OP(oscan) == CURLYX && data
3086 && data->flags & SF_IN_PAR
3087 && !(data->flags & SF_HAS_EVAL)
3088 && !deltanext && minnext == 1 ) {
3089 /* Try to optimize to CURLYN. */
3090 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3091 regnode * const nxt1 = nxt;
497b47a8 3092#ifdef DEBUGGING
8aa23a47 3093 regnode *nxt2;
497b47a8 3094#endif
c277df42 3095
8aa23a47
YO
3096 /* Skip open. */
3097 nxt = regnext(nxt);
3098 if (!strchr((const char*)PL_simple,OP(nxt))
3099 && !(PL_regkind[OP(nxt)] == EXACT
3100 && STR_LEN(nxt) == 1))
3101 goto nogo;
497b47a8 3102#ifdef DEBUGGING
8aa23a47 3103 nxt2 = nxt;
497b47a8 3104#endif
8aa23a47
YO
3105 nxt = regnext(nxt);
3106 if (OP(nxt) != CLOSE)
3107 goto nogo;
3108 if (RExC_open_parens) {
3109 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3110 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3111 }
3112 /* Now we know that nxt2 is the only contents: */
3113 oscan->flags = (U8)ARG(nxt);
3114 OP(oscan) = CURLYN;
3115 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3116
c277df42 3117#ifdef DEBUGGING
8aa23a47
YO
3118 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3119 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3120 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3121 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3122 OP(nxt + 1) = OPTIMIZED; /* was count. */
3123 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3124#endif
8aa23a47
YO
3125 }
3126 nogo:
3127
3128 /* Try optimization CURLYX => CURLYM. */
3129 if ( OP(oscan) == CURLYX && data
3130 && !(data->flags & SF_HAS_PAR)
3131 && !(data->flags & SF_HAS_EVAL)
3132 && !deltanext /* atom is fixed width */
3133 && minnext != 0 /* CURLYM can't handle zero width */
3134 ) {
3135 /* XXXX How to optimize if data == 0? */
3136 /* Optimize to a simpler form. */
3137 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3138 regnode *nxt2;
3139
3140 OP(oscan) = CURLYM;
3141 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3142 && (OP(nxt2) != WHILEM))
3143 nxt = nxt2;
3144 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3145 /* Need to optimize away parenths. */
3146 if (data->flags & SF_IN_PAR) {
3147 /* Set the parenth number. */
3148 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3149
3150 if (OP(nxt) != CLOSE)
3151 FAIL("Panic opt close");
3152 oscan->flags = (U8)ARG(nxt);
3153 if (RExC_open_parens) {
3154 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3155 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3156 }
8aa23a47
YO
3157 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3158 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3159
c277df42 3160#ifdef DEBUGGING
8aa23a47
YO
3161 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3162 OP(nxt + 1) = OPTIMIZED; /* was count. */
3163 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3164 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3165#endif
c277df42 3166#if 0
8aa23a47
YO
3167 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3168 regnode *nnxt = regnext(nxt1);
3169
3170 if (nnxt == nxt) {
3171 if (reg_off_by_arg[OP(nxt1)])
3172 ARG_SET(nxt1, nxt2 - nxt1);
3173 else if (nxt2 - nxt1 < U16_MAX)
3174 NEXT_OFF(nxt1) = nxt2 - nxt1;
3175 else
3176 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3177 }
8aa23a47 3178 nxt1 = nnxt;
c277df42 3179 }
5d1c421c 3180#endif
8aa23a47
YO
3181 /* Optimize again: */
3182 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3183 NULL, stopparen, recursed, NULL, 0,depth+1);
3184 }
3185 else
3186 oscan->flags = 0;
3187 }
3188 else if ((OP(oscan) == CURLYX)
3189 && (flags & SCF_WHILEM_VISITED_POS)
3190 /* See the comment on a similar expression above.
3191 However, this time it not a subexpression
3192 we care about, but the expression itself. */
3193 && (maxcount == REG_INFTY)
3194 && data && ++data->whilem_c < 16) {
3195 /* This stays as CURLYX, we can put the count/of pair. */
3196 /* Find WHILEM (as in regexec.c) */
3197 regnode *nxt = oscan + NEXT_OFF(oscan);
3198
3199 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3200 nxt += ARG(nxt);
3201 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3202 | (RExC_whilem_seen << 4)); /* On WHILEM */
3203 }
3204 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3205 pars++;
3206 if (flags & SCF_DO_SUBSTR) {
3207 SV *last_str = NULL;
3208 int counted = mincount != 0;
a0ed51b3 3209
8aa23a47
YO
3210 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3211#if defined(SPARC64_GCC_WORKAROUND)
3212 I32 b = 0;
3213 STRLEN l = 0;
3214 const char *s = NULL;
3215 I32 old = 0;
b515a41d 3216
8aa23a47
YO
3217 if (pos_before >= data->last_start_min)
3218 b = pos_before;
3219 else
3220 b = data->last_start_min;
b515a41d 3221
8aa23a47
YO
3222 l = 0;
3223 s = SvPV_const(data->last_found, l);
3224 old = b - data->last_start_min;
3225
3226#else
3227 I32 b = pos_before >= data->last_start_min
3228 ? pos_before : data->last_start_min;
3229 STRLEN l;
3230 const char * const s = SvPV_const(data->last_found, l);
3231 I32 old = b - data->last_start_min;
3232#endif
3233
3234 if (UTF)
3235 old = utf8_hop((U8*)s, old) - (U8*)s;
3236
3237 l -= old;
3238 /* Get the added string: */
3239 last_str = newSVpvn(s + old, l);
3240 if (UTF)
3241 SvUTF8_on(last_str);
3242 if (deltanext == 0 && pos_before == b) {
3243 /* What was added is a constant string */
3244 if (mincount > 1) {
3245 SvGROW(last_str, (mincount * l) + 1);
3246 repeatcpy(SvPVX(last_str) + l,
3247 SvPVX_const(last_str), l, mincount - 1);
3248 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3249 /* Add additional parts. */
3250 SvCUR_set(data->last_found,
3251 SvCUR(data->last_found) - l);
3252 sv_catsv(data->last_found, last_str);
3253 {
3254 SV * sv = data->last_found;
3255 MAGIC *mg =
3256 SvUTF8(sv) && SvMAGICAL(sv) ?
3257 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3258 if (mg && mg->mg_len >= 0)
3259 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3260 }
8aa23a47 3261 data->last_end += l * (mincount - 1);
b515a41d 3262 }
8aa23a47
YO
3263 } else {
3264 /* start offset must point into the last copy */
3265 data->last_start_min += minnext * (mincount - 1);
3266 data->last_start_max += is_inf ? I32_MAX
3267 : (maxcount - 1) * (minnext + data->pos_delta);
3268 }
c277df42 3269 }
8aa23a47
YO
3270 /* It is counted once already... */
3271 data->pos_min += minnext * (mincount - counted);
3272 data->pos_delta += - counted * deltanext +
3273 (minnext + deltanext) * maxcount - minnext * mincount;
3274 if (mincount != maxcount) {
3275 /* Cannot extend fixed substrings found inside
3276 the group. */
304ee84b 3277 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3278 if (mincount && last_str) {
3279 SV * const sv = data->last_found;
3280 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3281 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3282
3283 if (mg)
3284 mg->mg_len = -1;
3285 sv_setsv(sv, last_str);
3286 data->last_end = data->pos_min;
3287 data->last_start_min =
3288 data->pos_min - CHR_SVLEN(last_str);
3289 data->last_start_max = is_inf
3290 ? I32_MAX
3291 : data->pos_min + data->pos_delta
3292 - CHR_SVLEN(last_str);
3293 }
3294 data->longest = &(data->longest_float);
3295 }
3296 SvREFCNT_dec(last_str);
c277df42 3297 }
8aa23a47
YO
3298 if (data && (fl & SF_HAS_EVAL))
3299 data->flags |= SF_HAS_EVAL;
3300 optimize_curly_tail:
3301 if (OP(oscan) != CURLYX) {
3302 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3303 && NEXT_OFF(next))
3304 NEXT_OFF(oscan) += NEXT_OFF(next);
3305 }
3306 continue;
3307 default: /* REF and CLUMP only? */
3308 if (flags & SCF_DO_SUBSTR) {
304ee84b 3309 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3310 data->longest = &(data->longest_float);
3311 }
3312 is_inf = is_inf_internal = 1;
3313 if (flags & SCF_DO_STCLASS_OR)
3314 cl_anything(pRExC_state, data->start_class);
3315 flags &= ~SCF_DO_STCLASS;
3316 break;
c277df42 3317 }
8aa23a47
YO
3318 }
3319 else if (strchr((const char*)PL_simple,OP(scan))) {
3320 int value = 0;
653099ff 3321
8aa23a47 3322 if (flags & SCF_DO_SUBSTR) {
304ee84b 3323 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3324 data->pos_min++;
3325 }
3326 min++;
3327 if (flags & SCF_DO_STCLASS) {
3328 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3329
8aa23a47
YO
3330 /* Some of the logic below assumes that switching
3331 locale on will only add false positives. */
3332 switch (PL_regkind[OP(scan)]) {
3333 case SANY:
3334 default:
3335 do_default:
3336 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3337 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3338 cl_anything(pRExC_state, data->start_class);
3339 break;
3340 case REG_ANY:
3341 if (OP(scan) == SANY)
3342 goto do_default;
3343 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3344 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3345 || (data->start_class->flags & ANYOF_CLASS));
3346 cl_anything(pRExC_state, data->start_class);
653099ff 3347 }
8aa23a47
YO
3348 if (flags & SCF_DO_STCLASS_AND || !value)
3349 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3350 break;
3351 case ANYOF:
3352 if (flags & SCF_DO_STCLASS_AND)
3353 cl_and(data->start_class,
3354 (struct regnode_charclass_class*)scan);
653099ff 3355 else
8aa23a47
YO
3356 cl_or(pRExC_state, data->start_class,
3357 (struct regnode_charclass_class*)scan);
3358 break;
3359 case ALNUM:
3360 if (flags & SCF_DO_STCLASS_AND) {
3361 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3362 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3363 for (value = 0; value < 256; value++)
3364 if (!isALNUM(value))
3365 ANYOF_BITMAP_CLEAR(data->start_class, value);
3366 }
653099ff 3367 }
8aa23a47
YO
3368 else {
3369 if (data->start_class->flags & ANYOF_LOCALE)
3370 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3371 else {
3372 for (value = 0; value < 256; value++)
3373 if (isALNUM(value))
3374 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3375 }
8aa23a47
YO
3376 }
3377 break;
3378 case ALNUML:
3379 if (flags & SCF_DO_STCLASS_AND) {
3380 if (data->start_class->flags & ANYOF_LOCALE)
3381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3382 }
3383 else {
3384 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3385 data->start_class->flags |= ANYOF_LOCALE;
3386 }
3387 break;
3388 case NALNUM:
3389 if (flags & SCF_DO_STCLASS_AND) {
3390 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3391 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3392 for (value = 0; value < 256; value++)
3393 if (isALNUM(value))
3394 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3395 }
3396 }
8aa23a47
YO
3397 else {
3398 if (data->start_class->flags & ANYOF_LOCALE)
3399 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3400 else {
3401 for (value = 0; value < 256; value++)
3402 if (!isALNUM(value))
3403 ANYOF_BITMAP_SET(data->start_class, value);
3404 }
653099ff 3405 }
8aa23a47
YO
3406 break;
3407 case NALNUML:
3408 if (flags & SCF_DO_STCLASS_AND) {
3409 if (data->start_class->flags & ANYOF_LOCALE)
3410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3411 }
8aa23a47
YO
3412 else {
3413 data->start_class->flags |= ANYOF_LOCALE;
3414 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3415 }
3416 break;
3417 case SPACE:
3418 if (flags & SCF_DO_STCLASS_AND) {
3419 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3420 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3421 for (value = 0; value < 256; value++)
3422 if (!isSPACE(value))
3423 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3424 }
3425 }
8aa23a47
YO
3426 else {
3427 if (data->start_class->flags & ANYOF_LOCALE)
3428 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3429 else {
3430 for (value = 0; value < 256; value++)
3431 if (isSPACE(value))
3432 ANYOF_BITMAP_SET(data->start_class, value);
3433 }
653099ff 3434 }
8aa23a47
YO
3435 break;
3436 case SPACEL:
3437 if (flags & SCF_DO_STCLASS_AND) {
3438 if (data->start_class->flags & ANYOF_LOCALE)
3439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3440 }
3441 else {
3442 data->start_class->flags |= ANYOF_LOCALE;
3443 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3444 }
3445 break;
3446 case NSPACE:
3447 if (flags & SCF_DO_STCLASS_AND) {
3448 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3449 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3450 for (value = 0; value < 256; value++)
3451 if (isSPACE(value))
3452 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3453 }
8aa23a47
YO
3454 }
3455 else {
3456 if (data->start_class->flags & ANYOF_LOCALE)
3457 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3458 else {
3459 for (value = 0; value < 256; value++)
3460 if (!isSPACE(value))
3461 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3462 }
3463 }
8aa23a47
YO
3464 break;
3465 case NSPACEL:
3466 if (flags & SCF_DO_STCLASS_AND) {
3467 if (data->start_class->flags & ANYOF_LOCALE) {
3468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3469 for (value = 0; value < 256; value++)
3470 if (!isSPACE(value))
3471 ANYOF_BITMAP_CLEAR(data->start_class, value);
3472 }
653099ff 3473 }
8aa23a47
YO
3474 else {
3475 data->start_class->flags |= ANYOF_LOCALE;
3476 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3477 }
3478 break;
3479 case DIGIT:
3480 if (flags & SCF_DO_STCLASS_AND) {
3481 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3482 for (value = 0; value < 256; value++)
3483 if (!isDIGIT(value))
3484 ANYOF_BITMAP_CLEAR(data->start_class, value);
3485 }
3486 else {
3487 if (data->start_class->flags & ANYOF_LOCALE)
3488 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3489 else {
3490 for (value = 0; value < 256; value++)
3491 if (isDIGIT(value))
3492 ANYOF_BITMAP_SET(data->start_class, value);
3493 }
3494 }
3495 break;
3496 case NDIGIT:
3497 if (flags & SCF_DO_STCLASS_AND) {
3498 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3499 for (value = 0; value < 256; value++)
3500 if (isDIGIT(value))
3501 ANYOF_BITMAP_CLEAR(data->start_class, value);
3502 }
3503 else {
3504 if (data->start_class->flags & ANYOF_LOCALE)
3505 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3506 else {
3507 for (value = 0; value < 256; value++)
3508 if (!isDIGIT(value))
3509 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3510 }
3511 }
8aa23a47
YO
3512 break;
3513 }
3514 if (flags & SCF_DO_STCLASS_OR)
3515 cl_and(data->start_class, and_withp);
3516 flags &= ~SCF_DO_STCLASS;
3517 }
3518 }
3519 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3520 data->flags |= (OP(scan) == MEOL
3521 ? SF_BEFORE_MEOL
3522 : SF_BEFORE_SEOL);
3523 }
3524 else if ( PL_regkind[OP(scan)] == BRANCHJ
3525 /* Lookbehind, or need to calculate parens/evals/stclass: */
3526 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3527 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3528 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3529 || OP(scan) == UNLESSM )
3530 {
3531 /* Negative Lookahead/lookbehind
3532 In this case we can't do fixed string optimisation.
3533 */
1de06328 3534
8aa23a47
YO
3535 I32 deltanext, minnext, fake = 0;
3536 regnode *nscan;
3537 struct regnode_charclass_class intrnl;
3538 int f = 0;
1de06328 3539
8aa23a47
YO
3540 data_fake.flags = 0;
3541 if (data) {
3542 data_fake.whilem_c = data->whilem_c;
3543 data_fake.last_closep = data->last_closep;
c277df42 3544 }
8aa23a47
YO
3545 else
3546 data_fake.last_closep = &fake;
58e23c8d 3547 data_fake.pos_delta = delta;
8aa23a47
YO
3548 if ( flags & SCF_DO_STCLASS && !scan->flags
3549 && OP(scan) == IFMATCH ) { /* Lookahead */
3550 cl_init(pRExC_state, &intrnl);
3551 data_fake.start_class = &intrnl;
3552 f |= SCF_DO_STCLASS_AND;
3553 }
3554 if (flags & SCF_WHILEM_VISITED_POS)
3555 f |= SCF_WHILEM_VISITED_POS;
3556 next = regnext(scan);
3557 nscan = NEXTOPER(NEXTOPER(scan));
3558 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3559 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3560 if (scan->flags) {
3561 if (deltanext) {
58e23c8d 3562 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3563 }
3564 else if (minnext > (I32)U8_MAX) {
58e23c8d 3565 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3566 }
3567 scan->flags = (U8)minnext;
3568 }
3569 if (data) {
3570 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3571 pars++;
3572 if (data_fake.flags & SF_HAS_EVAL)
3573 data->flags |= SF_HAS_EVAL;
3574 data->whilem_c = data_fake.whilem_c;
3575 }
3576 if (f & SCF_DO_STCLASS_AND) {
3577 const int was = (data->start_class->flags & ANYOF_EOS);
3578
3579 cl_and(data->start_class, &intrnl);
3580 if (was)
3581 data->start_class->flags |= ANYOF_EOS;
3582 }
cb434fcc 3583 }
8aa23a47
YO
3584#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3585 else {
3586 /* Positive Lookahead/lookbehind
3587 In this case we can do fixed string optimisation,
3588 but we must be careful about it. Note in the case of
3589 lookbehind the positions will be offset by the minimum
3590 length of the pattern, something we won't know about
3591 until after the recurse.
3592 */
3593 I32 deltanext, fake = 0;
3594 regnode *nscan;
3595 struct regnode_charclass_class intrnl;
3596 int f = 0;
3597 /* We use SAVEFREEPV so that when the full compile
3598 is finished perl will clean up the allocated
3599 minlens when its all done. This was we don't
3600 have to worry about freeing them when we know
3601 they wont be used, which would be a pain.
3602 */
3603 I32 *minnextp;
3604 Newx( minnextp, 1, I32 );
3605 SAVEFREEPV(minnextp);
3606
3607 if (data) {
3608 StructCopy(data, &data_fake, scan_data_t);
3609 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3610 f |= SCF_DO_SUBSTR;
3611 if (scan->flags)
304ee84b 3612 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3613 data_fake.last_found=newSVsv(data->last_found);
3614 }
3615 }
3616 else
3617 data_fake.last_closep = &fake;
3618 data_fake.flags = 0;
58e23c8d 3619 data_fake.pos_delta = delta;
8aa23a47
YO
3620 if (is_inf)
3621 data_fake.flags |= SF_IS_INF;
3622 if ( flags & SCF_DO_STCLASS && !scan->flags
3623 && OP(scan) == IFMATCH ) { /* Lookahead */
3624 cl_init(pRExC_state, &intrnl);
3625 data_fake.start_class = &intrnl;
3626 f |= SCF_DO_STCLASS_AND;
3627 }
3628 if (flags & SCF_WHILEM_VISITED_POS)
3629 f |= SCF_WHILEM_VISITED_POS;
3630 next = regnext(scan);
3631 nscan = NEXTOPER(NEXTOPER(scan));
3632
3633 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3634 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3635 if (scan->flags) {
3636 if (deltanext) {
58e23c8d 3637 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3638 }
3639 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3640 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3641 }
3642 scan->flags = (U8)*minnextp;
3643 }
3644
3645 *minnextp += min;
3646
3647 if (f & SCF_DO_STCLASS_AND) {
3648 const int was = (data->start_class->flags & ANYOF_EOS);
3649
3650 cl_and(data->start_class, &intrnl);
3651 if (was)
3652 data->start_class->flags |= ANYOF_EOS;
3653 }
3654 if (data) {
3655 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3656 pars++;
3657 if (data_fake.flags & SF_HAS_EVAL)
3658 data->flags |= SF_HAS_EVAL;
3659 data->whilem_c = data_fake.whilem_c;
3660 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3661 if (RExC_rx->minlen<*minnextp)
3662 RExC_rx->minlen=*minnextp;
304ee84b 3663 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3664 SvREFCNT_dec(data_fake.last_found);
3665
3666 if ( data_fake.minlen_fixed != minlenp )
3667 {
3668 data->offset_fixed= data_fake.offset_fixed;
3669 data->minlen_fixed= data_fake.minlen_fixed;
3670 data->lookbehind_fixed+= scan->flags;
3671 }
3672 if ( data_fake.minlen_float != minlenp )
3673 {
3674 data->minlen_float= data_fake.minlen_float;
3675 data->offset_float_min=data_fake.offset_float_min;
3676 data->offset_float_max=data_fake.offset_float_max;
3677 data->lookbehind_float+= scan->flags;
3678 }
3679 }
3680 }
3681
3682
40d049e4 3683 }
8aa23a47
YO
3684#endif
3685 }
3686 else if (OP(scan) == OPEN) {
3687 if (stopparen != (I32)ARG(scan))
3688 pars++;
3689 }
3690 else if (OP(scan) == CLOSE) {
3691 if (stopparen == (I32)ARG(scan)) {
3692 break;
3693 }
3694 if ((I32)ARG(scan) == is_par) {
3695 next = regnext(scan);
b515a41d 3696
8aa23a47
YO
3697 if ( next && (OP(next) != WHILEM) && next < last)
3698 is_par = 0; /* Disable optimization */
40d049e4 3699 }
8aa23a47
YO
3700 if (data)
3701 *(data->last_closep) = ARG(scan);
3702 }
3703 else if (OP(scan) == EVAL) {
c277df42
IZ
3704 if (data)
3705 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3706 }
3707 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3708 if (flags & SCF_DO_SUBSTR) {
304ee84b 3709 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3710 flags &= ~SCF_DO_SUBSTR;
40d049e4 3711 }
8aa23a47
YO
3712 if (data && OP(scan)==ACCEPT) {
3713 data->flags |= SCF_SEEN_ACCEPT;
3714 if (stopmin > min)
3715 stopmin = min;
e2e6a0f1 3716 }
8aa23a47
YO
3717 }
3718 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3719 {
0f5d15d6 3720 if (flags & SCF_DO_SUBSTR) {
304ee84b 3721 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3722 data->longest = &(data->longest_float);
3723 }
3724 is_inf = is_inf_internal = 1;
653099ff 3725 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3726 cl_anything(pRExC_state, data->start_class);
96776eda 3727 flags &= ~SCF_DO_STCLASS;
8aa23a47 3728 }
58e23c8d 3729 else if (OP(scan) == GPOS) {
bbe252da 3730 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3731 !(delta || is_inf || (data && data->pos_delta)))
3732 {
bbe252da
YO
3733 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3734 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3735 if (RExC_rx->gofs < (U32)min)
3736 RExC_rx->gofs = min;
3737 } else {
bbe252da 3738 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3739 RExC_rx->gofs = 0;
3740 }
3741 }
786e8c11 3742#ifdef TRIE_STUDY_OPT
40d049e4 3743#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3744 else if (PL_regkind[OP(scan)] == TRIE) {
3745 /* NOTE - There is similar code to this block above for handling
3746 BRANCH nodes on the initial study. If you change stuff here
3747 check there too. */
3748 regnode *trie_node= scan;
3749 regnode *tail= regnext(scan);
f8fc2ecf 3750 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3751 I32 max1 = 0, min1 = I32_MAX;
3752 struct regnode_charclass_class accum;
3753
3754 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3755 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3756 if (flags & SCF_DO_STCLASS)
3757 cl_init_zero(pRExC_state, &accum);
3758
3759 if (!trie->jump) {
3760 min1= trie->minlen;
3761 max1= trie->maxlen;
3762 } else {
3763 const regnode *nextbranch= NULL;
3764 U32 word;
3765
3766 for ( word=1 ; word <= trie->wordcount ; word++)
3767 {
3768 I32 deltanext=0, minnext=0, f = 0, fake;
3769 struct regnode_charclass_class this_class;
3770
3771 data_fake.flags = 0;
3772 if (data) {
3773 data_fake.whilem_c = data->whilem_c;
3774 data_fake.last_closep = data->last_closep;
3775 }
3776 else
3777 data_fake.last_closep = &fake;
58e23c8d 3778 data_fake.pos_delta = delta;
8aa23a47
YO
3779 if (flags & SCF_DO_STCLASS) {
3780 cl_init(pRExC_state, &this_class);
3781 data_fake.start_class = &this_class;
3782 f = SCF_DO_STCLASS_AND;
3783 }
3784 if (flags & SCF_WHILEM_VISITED_POS)
3785 f |= SCF_WHILEM_VISITED_POS;
3786
3787 if (trie->jump[word]) {
3788 if (!nextbranch)
3789 nextbranch = trie_node + trie->jump[0];
3790 scan= trie_node + trie->jump[word];
3791 /* We go from the jump point to the branch that follows
3792 it. Note this means we need the vestigal unused branches
3793 even though they arent otherwise used.
3794 */
3795 minnext = study_chunk(pRExC_state, &scan, minlenp,
3796 &deltanext, (regnode *)nextbranch, &data_fake,
3797 stopparen, recursed, NULL, f,depth+1);
3798 }
3799 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3800 nextbranch= regnext((regnode*)nextbranch);
3801
3802 if (min1 > (I32)(minnext + trie->minlen))
3803 min1 = minnext + trie->minlen;
3804 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3805 max1 = minnext + deltanext + trie->maxlen;
3806 if (deltanext == I32_MAX)
3807 is_inf = is_inf_internal = 1;
3808
3809 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3810 pars++;
3811 if (data_fake.flags & SCF_SEEN_ACCEPT) {