This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Vadim did the static build change
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
1f1031fe 129
40d049e4
YO
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
3dab1dad 136#ifdef DEBUGGING
be8e71aa 137 const char *lastparse;
3dab1dad 138 I32 lastnum;
1f1031fe 139 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 142#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 143#endif
830247a4
IZ
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
f8fc2ecf 149#define RExC_rxi (pRExC_state->rxi)
fac92740 150#define RExC_start (pRExC_state->start)
830247a4
IZ
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
154#ifdef RE_TRACK_PATTERN_OFFSETS
155#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
156#endif
830247a4 157#define RExC_emit (pRExC_state->emit)
fac92740 158#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
159#define RExC_naughty (pRExC_state->naughty)
160#define RExC_sawback (pRExC_state->sawback)
161#define RExC_seen (pRExC_state->seen)
162#define RExC_size (pRExC_state->size)
163#define RExC_npar (pRExC_state->npar)
e2e6a0f1 164#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
165#define RExC_extralen (pRExC_state->extralen)
166#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
167#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 168#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 169#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
170#define RExC_open_parens (pRExC_state->open_parens)
171#define RExC_close_parens (pRExC_state->close_parens)
172#define RExC_opend (pRExC_state->opend)
81714fb9 173#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
174#define RExC_recurse (pRExC_state->recurse)
175#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 176
cde0cee5 177
a687059c
LW
178#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
179#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
180 ((*s) == '{' && regcurly(s)))
a687059c 181
35c8bce7
LW
182#ifdef SPSTART
183#undef SPSTART /* dratted cpp namespace... */
184#endif
a687059c
LW
185/*
186 * Flags to be passed up and down.
187 */
a687059c 188#define WORST 0 /* Worst case. */
821b33a5 189#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
190#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
191#define SPSTART 0x4 /* Starts with * or +. */
192#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 193
3dab1dad
YO
194#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
195
07be1b83
YO
196/* whether trie related optimizations are enabled */
197#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
198#define TRIE_STUDY_OPT
786e8c11 199#define FULL_TRIE_STUDY
07be1b83
YO
200#define TRIE_STCLASS
201#endif
1de06328
YO
202
203
40d049e4
YO
204
205#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
206#define PBITVAL(paren) (1 << ((paren) & 7))
207#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
208#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
209#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
210
211
1de06328
YO
212/* About scan_data_t.
213
214 During optimisation we recurse through the regexp program performing
215 various inplace (keyhole style) optimisations. In addition study_chunk
216 and scan_commit populate this data structure with information about
217 what strings MUST appear in the pattern. We look for the longest
218 string that must appear for at a fixed location, and we look for the
219 longest string that may appear at a floating location. So for instance
220 in the pattern:
221
222 /FOO[xX]A.*B[xX]BAR/
223
224 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
225 strings (because they follow a .* construct). study_chunk will identify
226 both FOO and BAR as being the longest fixed and floating strings respectively.
227
228 The strings can be composites, for instance
229
230 /(f)(o)(o)/
231
232 will result in a composite fixed substring 'foo'.
233
234 For each string some basic information is maintained:
235
236 - offset or min_offset
237 This is the position the string must appear at, or not before.
238 It also implicitly (when combined with minlenp) tells us how many
239 character must match before the string we are searching.
240 Likewise when combined with minlenp and the length of the string
241 tells us how many characters must appear after the string we have
242 found.
243
244 - max_offset
245 Only used for floating strings. This is the rightmost point that
246 the string can appear at. Ifset to I32 max it indicates that the
247 string can occur infinitely far to the right.
248
249 - minlenp
250 A pointer to the minimum length of the pattern that the string
251 was found inside. This is important as in the case of positive
252 lookahead or positive lookbehind we can have multiple patterns
253 involved. Consider
254
255 /(?=FOO).*F/
256
257 The minimum length of the pattern overall is 3, the minimum length
258 of the lookahead part is 3, but the minimum length of the part that
259 will actually match is 1. So 'FOO's minimum length is 3, but the
260 minimum length for the F is 1. This is important as the minimum length
261 is used to determine offsets in front of and behind the string being
262 looked for. Since strings can be composites this is the length of the
263 pattern at the time it was commited with a scan_commit. Note that
264 the length is calculated by study_chunk, so that the minimum lengths
265 are not known until the full pattern has been compiled, thus the
266 pointer to the value.
267
268 - lookbehind
269
270 In the case of lookbehind the string being searched for can be
271 offset past the start point of the final matching string.
272 If this value was just blithely removed from the min_offset it would
273 invalidate some of the calculations for how many chars must match
274 before or after (as they are derived from min_offset and minlen and
275 the length of the string being searched for).
276 When the final pattern is compiled and the data is moved from the
277 scan_data_t structure into the regexp structure the information
278 about lookbehind is factored in, with the information that would
279 have been lost precalculated in the end_shift field for the
280 associated string.
281
282 The fields pos_min and pos_delta are used to store the minimum offset
283 and the delta to the maximum offset at the current point in the pattern.
284
285*/
2c2d71f5
JH
286
287typedef struct scan_data_t {
1de06328
YO
288 /*I32 len_min; unused */
289 /*I32 len_delta; unused */
2c2d71f5
JH
290 I32 pos_min;
291 I32 pos_delta;
292 SV *last_found;
1de06328 293 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
294 I32 last_start_min;
295 I32 last_start_max;
1de06328
YO
296 SV **longest; /* Either &l_fixed, or &l_float. */
297 SV *longest_fixed; /* longest fixed string found in pattern */
298 I32 offset_fixed; /* offset where it starts */
299 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
301 SV *longest_float; /* longest floating string found in pattern */
302 I32 offset_float_min; /* earliest point in string it can appear */
303 I32 offset_float_max; /* latest point in string it can appear */
304 I32 *minlen_float; /* pointer to the minlen relevent to the string */
305 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
306 I32 flags;
307 I32 whilem_c;
cb434fcc 308 I32 *last_closep;
653099ff 309 struct regnode_charclass_class *start_class;
2c2d71f5
JH
310} scan_data_t;
311
a687059c 312/*
e50aee73 313 * Forward declarations for pregcomp()'s friends.
a687059c 314 */
a0d0e21e 315
27da23d5 316static const scan_data_t zero_scan_data =
1de06328 317 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
318
319#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
320#define SF_BEFORE_SEOL 0x0001
321#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
322#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
323#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
324
09b7f37c
CB
325#ifdef NO_UNARY_PLUS
326# define SF_FIX_SHIFT_EOL (0+2)
327# define SF_FL_SHIFT_EOL (0+4)
328#else
329# define SF_FIX_SHIFT_EOL (+2)
330# define SF_FL_SHIFT_EOL (+4)
331#endif
c277df42
IZ
332
333#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
334#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
335
336#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
337#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
338#define SF_IS_INF 0x0040
339#define SF_HAS_PAR 0x0080
340#define SF_IN_PAR 0x0100
341#define SF_HAS_EVAL 0x0200
342#define SCF_DO_SUBSTR 0x0400
653099ff
GS
343#define SCF_DO_STCLASS_AND 0x0800
344#define SCF_DO_STCLASS_OR 0x1000
345#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 346#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 347
786e8c11 348#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 349#define SCF_SEEN_ACCEPT 0x8000
07be1b83 350
eb160463 351#define UTF (RExC_utf8 != 0)
bbe252da
YO
352#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
353#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 354
ffc61ed2 355#define OOB_UNICODE 12345678
93733859 356#define OOB_NAMEDCLASS -1
b8c5462f 357
a0ed51b3
LW
358#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
359#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
360
8615cb43 361
b45f050a
JF
362/* length of regex to show in messages that don't mark a position within */
363#define RegexLengthToShowInErrorMessages 127
364
365/*
366 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
367 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
368 * op/pragma/warn/regcomp.
369 */
7253e4e3
RK
370#define MARKER1 "<-- HERE" /* marker as it appears in the description */
371#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 372
7253e4e3 373#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
374
375/*
376 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
377 * arg. Show regex, up to a maximum length. If it's too long, chop and add
378 * "...".
379 */
58e23c8d 380#define _FAIL(code) STMT_START { \
bfed75c6 381 const char *ellipses = ""; \
ccb2c380
MP
382 IV len = RExC_end - RExC_precomp; \
383 \
384 if (!SIZE_ONLY) \
385 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
386 if (len > RegexLengthToShowInErrorMessages) { \
387 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
388 len = RegexLengthToShowInErrorMessages - 10; \
389 ellipses = "..."; \
390 } \
58e23c8d 391 code; \
ccb2c380 392} STMT_END
8615cb43 393
58e23c8d
YO
394#define FAIL(msg) _FAIL( \
395 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
396 msg, (int)len, RExC_precomp, ellipses))
397
398#define FAIL2(msg,arg) _FAIL( \
399 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
400 arg, (int)len, RExC_precomp, ellipses))
401
b45f050a 402/*
b45f050a
JF
403 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
404 */
ccb2c380 405#define Simple_vFAIL(m) STMT_START { \
a28509cc 406 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
407 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
408 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
409} STMT_END
b45f050a
JF
410
411/*
412 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
413 */
ccb2c380
MP
414#define vFAIL(m) STMT_START { \
415 if (!SIZE_ONLY) \
416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
417 Simple_vFAIL(m); \
418} STMT_END
b45f050a
JF
419
420/*
421 * Like Simple_vFAIL(), but accepts two arguments.
422 */
ccb2c380 423#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 424 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
425 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
426 (int)offset, RExC_precomp, RExC_precomp + offset); \
427} STMT_END
b45f050a
JF
428
429/*
430 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
431 */
ccb2c380
MP
432#define vFAIL2(m,a1) STMT_START { \
433 if (!SIZE_ONLY) \
434 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
435 Simple_vFAIL2(m, a1); \
436} STMT_END
b45f050a
JF
437
438
439/*
440 * Like Simple_vFAIL(), but accepts three arguments.
441 */
ccb2c380 442#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 443 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
444 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
445 (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
b45f050a
JF
447
448/*
449 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
450 */
ccb2c380
MP
451#define vFAIL3(m,a1,a2) STMT_START { \
452 if (!SIZE_ONLY) \
453 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
454 Simple_vFAIL3(m, a1, a2); \
455} STMT_END
b45f050a
JF
456
457/*
458 * Like Simple_vFAIL(), but accepts four arguments.
459 */
ccb2c380 460#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 461 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
462 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
463 (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
b45f050a 465
ccb2c380 466#define vWARN(loc,m) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
469 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARNdep(loc,m) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
475 "%s" REPORT_LOCATION, \
476 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
477} STMT_END
478
479
480#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
485
486#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 487 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
488 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
489 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
490} STMT_END
491
492#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 493 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
494 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
495 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
496} STMT_END
497
498#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 499 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
500 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
501 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
502} STMT_END
9d1d55b5 503
8615cb43 504
cd439c50 505/* Allow for side effects in s */
ccb2c380
MP
506#define REGC(c,s) STMT_START { \
507 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
508} STMT_END
cd439c50 509
fac92740
MJD
510/* Macros for recording node offsets. 20001227 mjd@plover.com
511 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
512 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
513 * Element 0 holds the number n.
07be1b83 514 * Position is 1 indexed.
fac92740 515 */
7122b237
YO
516#ifndef RE_TRACK_PATTERN_OFFSETS
517#define Set_Node_Offset_To_R(node,byte)
518#define Set_Node_Offset(node,byte)
519#define Set_Cur_Node_Offset
520#define Set_Node_Length_To_R(node,len)
521#define Set_Node_Length(node,len)
522#define Set_Node_Cur_Length(node)
523#define Node_Offset(n)
524#define Node_Length(n)
525#define Set_Node_Offset_Length(node,offset,len)
526#define ProgLen(ri) ri->u.proglen
527#define SetProgLen(ri,x) ri->u.proglen = x
528#else
529#define ProgLen(ri) ri->u.offsets[0]
530#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
531#define Set_Node_Offset_To_R(node,byte) STMT_START { \
532 if (! SIZE_ONLY) { \
533 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 534 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 535 if((node) < 0) { \
551405c4 536 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
537 } else { \
538 RExC_offsets[2*(node)-1] = (byte); \
539 } \
540 } \
541} STMT_END
542
543#define Set_Node_Offset(node,byte) \
544 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
545#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
546
547#define Set_Node_Length_To_R(node,len) STMT_START { \
548 if (! SIZE_ONLY) { \
549 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 550 __LINE__, (int)(node), (int)(len))); \
ccb2c380 551 if((node) < 0) { \
551405c4 552 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
553 } else { \
554 RExC_offsets[2*(node)] = (len); \
555 } \
556 } \
557} STMT_END
558
559#define Set_Node_Length(node,len) \
560 Set_Node_Length_To_R((node)-RExC_emit_start, len)
561#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
562#define Set_Node_Cur_Length(node) \
563 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
564
565/* Get offsets and lengths */
566#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
567#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
568
07be1b83
YO
569#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
570 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
571 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
572} STMT_END
7122b237 573#endif
07be1b83
YO
574
575#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
576#define EXPERIMENTAL_INPLACESCAN
7122b237 577#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 578
304ee84b
YO
579#define DEBUG_STUDYDATA(str,data,depth) \
580DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 581 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
582 "%*s" str "Pos:%"IVdf"/%"IVdf \
583 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
584 (int)(depth)*2, "", \
585 (IV)((data)->pos_min), \
586 (IV)((data)->pos_delta), \
304ee84b 587 (UV)((data)->flags), \
1de06328 588 (IV)((data)->whilem_c), \
304ee84b
YO
589 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
590 is_inf ? "INF " : "" \
1de06328
YO
591 ); \
592 if ((data)->last_found) \
593 PerlIO_printf(Perl_debug_log, \
594 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
595 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
596 SvPVX_const((data)->last_found), \
597 (IV)((data)->last_end), \
598 (IV)((data)->last_start_min), \
599 (IV)((data)->last_start_max), \
600 ((data)->longest && \
601 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
602 SvPVX_const((data)->longest_fixed), \
603 (IV)((data)->offset_fixed), \
604 ((data)->longest && \
605 (data)->longest==&((data)->longest_float)) ? "*" : "", \
606 SvPVX_const((data)->longest_float), \
607 (IV)((data)->offset_float_min), \
608 (IV)((data)->offset_float_max) \
609 ); \
610 PerlIO_printf(Perl_debug_log,"\n"); \
611});
612
acfe0abc 613static void clear_re(pTHX_ void *r);
4327152a 614
653099ff 615/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 616 Update the longest found anchored substring and the longest found
653099ff
GS
617 floating substrings if needed. */
618
4327152a 619STATIC void
304ee84b 620S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 621{
e1ec3a88
AL
622 const STRLEN l = CHR_SVLEN(data->last_found);
623 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 624 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 625
c277df42 626 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 627 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
628 if (*data->longest == data->longest_fixed) {
629 data->offset_fixed = l ? data->last_start_min : data->pos_min;
630 if (data->flags & SF_BEFORE_EOL)
b81d288d 631 data->flags
c277df42
IZ
632 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
633 else
634 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
635 data->minlen_fixed=minlenp;
636 data->lookbehind_fixed=0;
a0ed51b3 637 }
304ee84b 638 else { /* *data->longest == data->longest_float */
c277df42 639 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
640 data->offset_float_max = (l
641 ? data->last_start_max
c277df42 642 : data->pos_min + data->pos_delta);
304ee84b 643 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 644 data->offset_float_max = I32_MAX;
c277df42 645 if (data->flags & SF_BEFORE_EOL)
b81d288d 646 data->flags
c277df42
IZ
647 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
648 else
649 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
650 data->minlen_float=minlenp;
651 data->lookbehind_float=0;
c277df42
IZ
652 }
653 }
654 SvCUR_set(data->last_found, 0);
0eda9292 655 {
a28509cc 656 SV * const sv = data->last_found;
097eb12c
AL
657 if (SvUTF8(sv) && SvMAGICAL(sv)) {
658 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
659 if (mg)
660 mg->mg_len = 0;
661 }
0eda9292 662 }
c277df42
IZ
663 data->last_end = -1;
664 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 665 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
666}
667
653099ff
GS
668/* Can match anything (initialization) */
669STATIC void
097eb12c 670S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 671{
653099ff 672 ANYOF_CLASS_ZERO(cl);
f8bef550 673 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 674 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
675 if (LOC)
676 cl->flags |= ANYOF_LOCALE;
677}
678
679/* Can match anything (initialization) */
680STATIC int
5f66b61c 681S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
682{
683 int value;
684
aaa51d5e 685 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
686 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
687 return 1;
1aa99e6b
IH
688 if (!(cl->flags & ANYOF_UNICODE_ALL))
689 return 0;
10edeb5d 690 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 691 return 0;
653099ff
GS
692 return 1;
693}
694
695/* Can match anything (initialization) */
696STATIC void
097eb12c 697S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 698{
8ecf7187 699 Zero(cl, 1, struct regnode_charclass_class);
653099ff 700 cl->type = ANYOF;
830247a4 701 cl_anything(pRExC_state, cl);
653099ff
GS
702}
703
704STATIC void
097eb12c 705S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 706{
8ecf7187 707 Zero(cl, 1, struct regnode_charclass_class);
653099ff 708 cl->type = ANYOF;
830247a4 709 cl_anything(pRExC_state, cl);
653099ff
GS
710 if (LOC)
711 cl->flags |= ANYOF_LOCALE;
712}
713
714/* 'And' a given class with another one. Can create false positives */
715/* We assume that cl is not inverted */
716STATIC void
5f66b61c 717S_cl_and(struct regnode_charclass_class *cl,
a28509cc 718 const struct regnode_charclass_class *and_with)
653099ff 719{
40d049e4
YO
720
721 assert(and_with->type == ANYOF);
653099ff
GS
722 if (!(and_with->flags & ANYOF_CLASS)
723 && !(cl->flags & ANYOF_CLASS)
724 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
725 && !(and_with->flags & ANYOF_FOLD)
726 && !(cl->flags & ANYOF_FOLD)) {
727 int i;
728
729 if (and_with->flags & ANYOF_INVERT)
730 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
731 cl->bitmap[i] &= ~and_with->bitmap[i];
732 else
733 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
734 cl->bitmap[i] &= and_with->bitmap[i];
735 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
736 if (!(and_with->flags & ANYOF_EOS))
737 cl->flags &= ~ANYOF_EOS;
1aa99e6b 738
14ebb1a2
JH
739 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
740 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
741 cl->flags &= ~ANYOF_UNICODE_ALL;
742 cl->flags |= ANYOF_UNICODE;
743 ARG_SET(cl, ARG(and_with));
744 }
14ebb1a2
JH
745 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
746 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 747 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
748 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
749 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 750 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
751}
752
753/* 'OR' a given class with another one. Can create false positives */
754/* We assume that cl is not inverted */
755STATIC void
097eb12c 756S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 757{
653099ff
GS
758 if (or_with->flags & ANYOF_INVERT) {
759 /* We do not use
760 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
761 * <= (B1 | !B2) | (CL1 | !CL2)
762 * which is wasteful if CL2 is small, but we ignore CL2:
763 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
764 * XXXX Can we handle case-fold? Unclear:
765 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
766 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
767 */
768 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
769 && !(or_with->flags & ANYOF_FOLD)
770 && !(cl->flags & ANYOF_FOLD) ) {
771 int i;
772
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] |= ~or_with->bitmap[i];
775 } /* XXXX: logic is complicated otherwise */
776 else {
830247a4 777 cl_anything(pRExC_state, cl);
653099ff
GS
778 }
779 } else {
780 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
781 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 782 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
783 || (cl->flags & ANYOF_FOLD)) ) {
784 int i;
785
786 /* OR char bitmap and class bitmap separately */
787 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
788 cl->bitmap[i] |= or_with->bitmap[i];
789 if (or_with->flags & ANYOF_CLASS) {
790 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
791 cl->classflags[i] |= or_with->classflags[i];
792 cl->flags |= ANYOF_CLASS;
793 }
794 }
795 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 796 cl_anything(pRExC_state, cl);
653099ff
GS
797 }
798 }
799 if (or_with->flags & ANYOF_EOS)
800 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
801
802 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
803 ARG(cl) != ARG(or_with)) {
804 cl->flags |= ANYOF_UNICODE_ALL;
805 cl->flags &= ~ANYOF_UNICODE;
806 }
807 if (or_with->flags & ANYOF_UNICODE_ALL) {
808 cl->flags |= ANYOF_UNICODE_ALL;
809 cl->flags &= ~ANYOF_UNICODE;
810 }
653099ff
GS
811}
812
a3621e74
YO
813#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
814#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
815#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
816#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
817
3dab1dad
YO
818
819#ifdef DEBUGGING
07be1b83 820/*
2b8b4781
NC
821 dump_trie(trie,widecharmap,revcharmap)
822 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
823 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
824
825 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
826 The _interim_ variants are used for debugging the interim
827 tables that are used to generate the final compressed
828 representation which is what dump_trie expects.
829
3dab1dad
YO
830 Part of the reason for their existance is to provide a form
831 of documentation as to how the different representations function.
07be1b83
YO
832
833*/
3dab1dad
YO
834
835/*
3dab1dad
YO
836 Dumps the final compressed table form of the trie to Perl_debug_log.
837 Used for debugging make_trie().
838*/
839
840STATIC void
2b8b4781
NC
841S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
842 AV *revcharmap, U32 depth)
3dab1dad
YO
843{
844 U32 state;
ab3bbdeb 845 SV *sv=sv_newmortal();
55eed653 846 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
847 GET_RE_DEBUG_FLAGS_DECL;
848
ab3bbdeb 849
3dab1dad
YO
850 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
851 (int)depth * 2 + 2,"",
852 "Match","Base","Ofs" );
853
854 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 855 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 856 if ( tmp ) {
ab3bbdeb
YO
857 PerlIO_printf( Perl_debug_log, "%*s",
858 colwidth,
ddc5bc0f 859 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
860 PL_colors[0], PL_colors[1],
861 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
862 PERL_PV_ESCAPE_FIRSTCHAR
863 )
864 );
3dab1dad
YO
865 }
866 }
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
869
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 871 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
872 PerlIO_printf( Perl_debug_log, "\n");
873
1e2e3d02 874 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 875 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
876
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
878
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
881 } else {
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
883 }
884
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
886
887 if ( base ) {
888 U32 ofs = 0;
889
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
893 ofs++;
894
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
896
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
901 {
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*"UVXf,
903 colwidth,
3dab1dad
YO
904 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 } else {
ab3bbdeb 906 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
907 }
908 }
909
910 PerlIO_printf( Perl_debug_log, "]");
911
912 }
913 PerlIO_printf( Perl_debug_log, "\n" );
914 }
915}
916/*
3dab1dad
YO
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
921*/
922STATIC void
55eed653 923S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
924 HV *widecharmap, AV *revcharmap, U32 next_alloc,
925 U32 depth)
3dab1dad
YO
926{
927 U32 state;
ab3bbdeb 928 SV *sv=sv_newmortal();
55eed653 929 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
930 GET_RE_DEBUG_FLAGS_DECL;
931 /* print out the table precompression. */
ab3bbdeb
YO
932 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
933 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
934 "------:-----+-----------------\n" );
3dab1dad
YO
935
936 for( state=1 ; state < next_alloc ; state ++ ) {
937 U16 charid;
938
ab3bbdeb 939 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
940 (int)depth * 2 + 2,"", (UV)state );
941 if ( ! trie->states[ state ].wordnum ) {
942 PerlIO_printf( Perl_debug_log, "%5s| ","");
943 } else {
944 PerlIO_printf( Perl_debug_log, "W%4x| ",
945 trie->states[ state ].wordnum
946 );
947 }
948 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 949 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
950 if ( tmp ) {
951 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
952 colwidth,
ddc5bc0f 953 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
954 PL_colors[0], PL_colors[1],
955 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
956 PERL_PV_ESCAPE_FIRSTCHAR
957 ) ,
1e2e3d02
YO
958 TRIE_LIST_ITEM(state,charid).forid,
959 (UV)TRIE_LIST_ITEM(state,charid).newstate
960 );
961 if (!(charid % 10))
664e119d
RGS
962 PerlIO_printf(Perl_debug_log, "\n%*s| ",
963 (int)((depth * 2) + 14), "");
1e2e3d02 964 }
ab3bbdeb
YO
965 }
966 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
967 }
968}
969
970/*
3dab1dad
YO
971 Dumps a fully constructed but uncompressed trie in table form.
972 This is the normal DFA style state transition table, with a few
973 twists to facilitate compression later.
974 Used for debugging make_trie().
975*/
976STATIC void
55eed653 977S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
979 U32 depth)
3dab1dad
YO
980{
981 U32 state;
982 U16 charid;
ab3bbdeb 983 SV *sv=sv_newmortal();
55eed653 984 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
985 GET_RE_DEBUG_FLAGS_DECL;
986
987 /*
988 print out the table precompression so that we can do a visual check
989 that they are identical.
990 */
991
992 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
993
994 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 995 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 996 if ( tmp ) {
ab3bbdeb
YO
997 PerlIO_printf( Perl_debug_log, "%*s",
998 colwidth,
ddc5bc0f 999 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1000 PL_colors[0], PL_colors[1],
1001 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1002 PERL_PV_ESCAPE_FIRSTCHAR
1003 )
1004 );
3dab1dad
YO
1005 }
1006 }
1007
1008 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1009
1010 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1011 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1012 }
1013
1014 PerlIO_printf( Perl_debug_log, "\n" );
1015
1016 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1017
1018 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1019 (int)depth * 2 + 2,"",
1020 (UV)TRIE_NODENUM( state ) );
1021
1022 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1023 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1024 if (v)
1025 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1026 else
1027 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1028 }
1029 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1030 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1031 } else {
1032 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1033 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1034 }
1035 }
07be1b83 1036}
3dab1dad
YO
1037
1038#endif
1039
786e8c11
YO
1040/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1041 startbranch: the first branch in the whole branch sequence
1042 first : start branch of sequence of branch-exact nodes.
1043 May be the same as startbranch
1044 last : Thing following the last branch.
1045 May be the same as tail.
1046 tail : item following the branch sequence
1047 count : words in the sequence
1048 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1049 depth : indent depth
3dab1dad 1050
786e8c11 1051Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1052
786e8c11
YO
1053A trie is an N'ary tree where the branches are determined by digital
1054decomposition of the key. IE, at the root node you look up the 1st character and
1055follow that branch repeat until you find the end of the branches. Nodes can be
1056marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1057
786e8c11 1058 /he|she|his|hers/
72f13be8 1059
786e8c11
YO
1060would convert into the following structure. Numbers represent states, letters
1061following numbers represent valid transitions on the letter from that state, if
1062the number is in square brackets it represents an accepting state, otherwise it
1063will be in parenthesis.
07be1b83 1064
786e8c11
YO
1065 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1066 | |
1067 | (2)
1068 | |
1069 (1) +-i->(6)-+-s->[7]
1070 |
1071 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1072
786e8c11
YO
1073 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1074
1075This shows that when matching against the string 'hers' we will begin at state 1
1076read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1077then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1078is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1079single traverse. We store a mapping from accepting to state to which word was
1080matched, and then when we have multiple possibilities we try to complete the
1081rest of the regex in the order in which they occured in the alternation.
1082
1083The only prior NFA like behaviour that would be changed by the TRIE support is
1084the silent ignoring of duplicate alternations which are of the form:
1085
1086 / (DUPE|DUPE) X? (?{ ... }) Y /x
1087
1088Thus EVAL blocks follwing a trie may be called a different number of times with
1089and without the optimisation. With the optimisations dupes will be silently
1090ignored. This inconsistant behaviour of EVAL type nodes is well established as
1091the following demonstrates:
1092
1093 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1094
1095which prints out 'word' three times, but
1096
1097 'words'=~/(word|word|word)(?{ print $1 })S/
1098
1099which doesnt print it out at all. This is due to other optimisations kicking in.
1100
1101Example of what happens on a structural level:
1102
1103The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1104
1105 1: CURLYM[1] {1,32767}(18)
1106 5: BRANCH(8)
1107 6: EXACT <ac>(16)
1108 8: BRANCH(11)
1109 9: EXACT <ad>(16)
1110 11: BRANCH(14)
1111 12: EXACT <ab>(16)
1112 16: SUCCEED(0)
1113 17: NOTHING(18)
1114 18: END(0)
1115
1116This would be optimizable with startbranch=5, first=5, last=16, tail=16
1117and should turn into:
1118
1119 1: CURLYM[1] {1,32767}(18)
1120 5: TRIE(16)
1121 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1122 <ac>
1123 <ad>
1124 <ab>
1125 16: SUCCEED(0)
1126 17: NOTHING(18)
1127 18: END(0)
1128
1129Cases where tail != last would be like /(?foo|bar)baz/:
1130
1131 1: BRANCH(4)
1132 2: EXACT <foo>(8)
1133 4: BRANCH(7)
1134 5: EXACT <bar>(8)
1135 7: TAIL(8)
1136 8: EXACT <baz>(10)
1137 10: END(0)
1138
1139which would be optimizable with startbranch=1, first=1, last=7, tail=8
1140and would end up looking like:
1141
1142 1: TRIE(8)
1143 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1144 <foo>
1145 <bar>
1146 7: TAIL(8)
1147 8: EXACT <baz>(10)
1148 10: END(0)
1149
1150 d = uvuni_to_utf8_flags(d, uv, 0);
1151
1152is the recommended Unicode-aware way of saying
1153
1154 *(d++) = uv;
1155*/
1156
1e2e3d02 1157#define TRIE_STORE_REVCHAR \
786e8c11 1158 STMT_START { \
1e2e3d02 1159 SV *tmp = newSVpvs(""); \
786e8c11 1160 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1161 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1162 av_push( revcharmap, tmp ); \
786e8c11
YO
1163 } STMT_END
1164
1165#define TRIE_READ_CHAR STMT_START { \
1166 wordlen++; \
1167 if ( UTF ) { \
1168 if ( folder ) { \
1169 if ( foldlen > 0 ) { \
1170 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1171 foldlen -= len; \
1172 scan += len; \
1173 len = 0; \
1174 } else { \
1175 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1176 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1177 foldlen -= UNISKIP( uvc ); \
1178 scan = foldbuf + UNISKIP( uvc ); \
1179 } \
1180 } else { \
1181 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1182 } \
1183 } else { \
1184 uvc = (U32)*uc; \
1185 len = 1; \
1186 } \
1187} STMT_END
1188
1189
1190
1191#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1192 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1193 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1194 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1195 } \
1196 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1197 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1198 TRIE_LIST_CUR( state )++; \
1199} STMT_END
07be1b83 1200
786e8c11
YO
1201#define TRIE_LIST_NEW(state) STMT_START { \
1202 Newxz( trie->states[ state ].trans.list, \
1203 4, reg_trie_trans_le ); \
1204 TRIE_LIST_CUR( state ) = 1; \
1205 TRIE_LIST_LEN( state ) = 4; \
1206} STMT_END
07be1b83 1207
786e8c11
YO
1208#define TRIE_HANDLE_WORD(state) STMT_START { \
1209 U16 dupe= trie->states[ state ].wordnum; \
1210 regnode * const noper_next = regnext( noper ); \
1211 \
1212 if (trie->wordlen) \
1213 trie->wordlen[ curword ] = wordlen; \
1214 DEBUG_r({ \
1215 /* store the word for dumping */ \
1216 SV* tmp; \
1217 if (OP(noper) != NOTHING) \
1218 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1219 else \
1220 tmp = newSVpvn( "", 0 ); \
1221 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1222 av_push( trie_words, tmp ); \
786e8c11
YO
1223 }); \
1224 \
1225 curword++; \
1226 \
1227 if ( noper_next < tail ) { \
1228 if (!trie->jump) \
c944940b 1229 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1230 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1231 if (!jumper) \
1232 jumper = noper_next; \
1233 if (!nextbranch) \
1234 nextbranch= regnext(cur); \
1235 } \
1236 \
1237 if ( dupe ) { \
1238 /* So it's a dupe. This means we need to maintain a */\
1239 /* linked-list from the first to the next. */\
1240 /* we only allocate the nextword buffer when there */\
1241 /* a dupe, so first time we have to do the allocation */\
1242 if (!trie->nextword) \
c944940b 1243 trie->nextword = (U16 *) \
446bd890 1244 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1245 while ( trie->nextword[dupe] ) \
1246 dupe= trie->nextword[dupe]; \
1247 trie->nextword[dupe]= curword; \
1248 } else { \
1249 /* we haven't inserted this word yet. */ \
1250 trie->states[ state ].wordnum = curword; \
1251 } \
1252} STMT_END
07be1b83 1253
3dab1dad 1254
786e8c11
YO
1255#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1256 ( ( base + charid >= ucharcount \
1257 && base + charid < ubound \
1258 && state == trie->trans[ base - ucharcount + charid ].check \
1259 && trie->trans[ base - ucharcount + charid ].next ) \
1260 ? trie->trans[ base - ucharcount + charid ].next \
1261 : ( state==1 ? special : 0 ) \
1262 )
3dab1dad 1263
786e8c11
YO
1264#define MADE_TRIE 1
1265#define MADE_JUMP_TRIE 2
1266#define MADE_EXACT_TRIE 4
3dab1dad 1267
a3621e74 1268STATIC I32
786e8c11 1269S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1270{
27da23d5 1271 dVAR;
a3621e74
YO
1272 /* first pass, loop through and scan words */
1273 reg_trie_data *trie;
55eed653 1274 HV *widecharmap = NULL;
2b8b4781 1275 AV *revcharmap = newAV();
a3621e74 1276 regnode *cur;
9f7f3913 1277 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1278 STRLEN len = 0;
1279 UV uvc = 0;
1280 U16 curword = 0;
1281 U32 next_alloc = 0;
786e8c11
YO
1282 regnode *jumper = NULL;
1283 regnode *nextbranch = NULL;
7f69552c 1284 regnode *convert = NULL;
a3621e74 1285 /* we just use folder as a flag in utf8 */
e1ec3a88 1286 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1287 ? PL_fold
1288 : ( flags == EXACTFL
1289 ? PL_fold_locale
1290 : NULL
1291 )
1292 );
1293
2b8b4781
NC
1294#ifdef DEBUGGING
1295 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1296 AV *trie_words = NULL;
1297 /* along with revcharmap, this only used during construction but both are
1298 * useful during debugging so we store them in the struct when debugging.
8e11feef 1299 */
2b8b4781
NC
1300#else
1301 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1302 STRLEN trie_charcount=0;
3dab1dad 1303#endif
2b8b4781 1304 SV *re_trie_maxbuff;
a3621e74 1305 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1306#ifndef DEBUGGING
1307 PERL_UNUSED_ARG(depth);
1308#endif
a3621e74 1309
c944940b 1310 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1311 trie->refcount = 1;
3dab1dad 1312 trie->startstate = 1;
786e8c11 1313 trie->wordcount = word_count;
f8fc2ecf 1314 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1315 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1316 if (!(UTF && folder))
c944940b 1317 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1318 DEBUG_r({
2b8b4781 1319 trie_words = newAV();
a3621e74 1320 });
a3621e74 1321
0111c4fd 1322 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1323 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1324 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1325 }
3dab1dad
YO
1326 DEBUG_OPTIMISE_r({
1327 PerlIO_printf( Perl_debug_log,
786e8c11 1328 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1329 (int)depth * 2 + 2, "",
1330 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1331 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1332 (int)depth);
3dab1dad 1333 });
7f69552c
YO
1334
1335 /* Find the node we are going to overwrite */
1336 if ( first == startbranch && OP( last ) != BRANCH ) {
1337 /* whole branch chain */
1338 convert = first;
1339 } else {
1340 /* branch sub-chain */
1341 convert = NEXTOPER( first );
1342 }
1343
a3621e74
YO
1344 /* -- First loop and Setup --
1345
1346 We first traverse the branches and scan each word to determine if it
1347 contains widechars, and how many unique chars there are, this is
1348 important as we have to build a table with at least as many columns as we
1349 have unique chars.
1350
1351 We use an array of integers to represent the character codes 0..255
1352 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1353 native representation of the character value as the key and IV's for the
1354 coded index.
1355
1356 *TODO* If we keep track of how many times each character is used we can
1357 remap the columns so that the table compression later on is more
1358 efficient in terms of memory by ensuring most common value is in the
1359 middle and the least common are on the outside. IMO this would be better
1360 than a most to least common mapping as theres a decent chance the most
1361 common letter will share a node with the least common, meaning the node
1362 will not be compressable. With a middle is most common approach the worst
1363 case is when we have the least common nodes twice.
1364
1365 */
1366
a3621e74 1367 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1368 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1369 const U8 *uc = (U8*)STRING( noper );
a28509cc 1370 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1371 STRLEN foldlen = 0;
1372 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1373 const U8 *scan = (U8*)NULL;
07be1b83 1374 U32 wordlen = 0; /* required init */
3dab1dad 1375 STRLEN chars=0;
a3621e74 1376
3dab1dad
YO
1377 if (OP(noper) == NOTHING) {
1378 trie->minlen= 0;
1379 continue;
1380 }
1381 if (trie->bitmap) {
1382 TRIE_BITMAP_SET(trie,*uc);
1383 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1384 }
a3621e74 1385 for ( ; uc < e ; uc += len ) {
3dab1dad 1386 TRIE_CHARCOUNT(trie)++;
a3621e74 1387 TRIE_READ_CHAR;
3dab1dad 1388 chars++;
a3621e74
YO
1389 if ( uvc < 256 ) {
1390 if ( !trie->charmap[ uvc ] ) {
1391 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1392 if ( folder )
1393 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1394 TRIE_STORE_REVCHAR;
a3621e74
YO
1395 }
1396 } else {
1397 SV** svpp;
55eed653
NC
1398 if ( !widecharmap )
1399 widecharmap = newHV();
a3621e74 1400
55eed653 1401 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1402
1403 if ( !svpp )
e4584336 1404 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1405
1406 if ( !SvTRUE( *svpp ) ) {
1407 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1408 TRIE_STORE_REVCHAR;
a3621e74
YO
1409 }
1410 }
1411 }
3dab1dad
YO
1412 if( cur == first ) {
1413 trie->minlen=chars;
1414 trie->maxlen=chars;
1415 } else if (chars < trie->minlen) {
1416 trie->minlen=chars;
1417 } else if (chars > trie->maxlen) {
1418 trie->maxlen=chars;
1419 }
1420
a3621e74
YO
1421 } /* end first pass */
1422 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1423 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1424 (int)depth * 2 + 2,"",
55eed653 1425 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1426 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1427 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1428 );
c944940b 1429 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1430
1431 /*
1432 We now know what we are dealing with in terms of unique chars and
1433 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1434 representation using a flat table will take. If it's over a reasonable
1435 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1436 conservative but potentially much slower representation using an array
1437 of lists.
1438
1439 At the end we convert both representations into the same compressed
1440 form that will be used in regexec.c for matching with. The latter
1441 is a form that cannot be used to construct with but has memory
1442 properties similar to the list form and access properties similar
1443 to the table form making it both suitable for fast searches and
1444 small enough that its feasable to store for the duration of a program.
1445
1446 See the comment in the code where the compressed table is produced
1447 inplace from the flat tabe representation for an explanation of how
1448 the compression works.
1449
1450 */
1451
1452
3dab1dad 1453 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1454 /*
1455 Second Pass -- Array Of Lists Representation
1456
1457 Each state will be represented by a list of charid:state records
1458 (reg_trie_trans_le) the first such element holds the CUR and LEN
1459 points of the allocated array. (See defines above).
1460
1461 We build the initial structure using the lists, and then convert
1462 it into the compressed table form which allows faster lookups
1463 (but cant be modified once converted).
a3621e74
YO
1464 */
1465
a3621e74
YO
1466 STRLEN transcount = 1;
1467
1e2e3d02
YO
1468 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1469 "%*sCompiling trie using list compiler\n",
1470 (int)depth * 2 + 2, ""));
446bd890 1471
c944940b
JH
1472 trie->states = (reg_trie_state *)
1473 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1474 sizeof(reg_trie_state) );
a3621e74
YO
1475 TRIE_LIST_NEW(1);
1476 next_alloc = 2;
1477
1478 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1479
c445ea15
AL
1480 regnode * const noper = NEXTOPER( cur );
1481 U8 *uc = (U8*)STRING( noper );
1482 const U8 * const e = uc + STR_LEN( noper );
1483 U32 state = 1; /* required init */
1484 U16 charid = 0; /* sanity init */
1485 U8 *scan = (U8*)NULL; /* sanity init */
1486 STRLEN foldlen = 0; /* required init */
07be1b83 1487 U32 wordlen = 0; /* required init */
c445ea15
AL
1488 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1489
3dab1dad 1490 if (OP(noper) != NOTHING) {
786e8c11 1491 for ( ; uc < e ; uc += len ) {
c445ea15 1492
786e8c11 1493 TRIE_READ_CHAR;
c445ea15 1494
786e8c11
YO
1495 if ( uvc < 256 ) {
1496 charid = trie->charmap[ uvc ];
c445ea15 1497 } else {
55eed653 1498 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1499 if ( !svpp ) {
1500 charid = 0;
1501 } else {
1502 charid=(U16)SvIV( *svpp );
1503 }
c445ea15 1504 }
786e8c11
YO
1505 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1506 if ( charid ) {
a3621e74 1507
786e8c11
YO
1508 U16 check;
1509 U32 newstate = 0;
a3621e74 1510
786e8c11
YO
1511 charid--;
1512 if ( !trie->states[ state ].trans.list ) {
1513 TRIE_LIST_NEW( state );
c445ea15 1514 }
786e8c11
YO
1515 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1516 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1517 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1518 break;
1519 }
1520 }
1521 if ( ! newstate ) {
1522 newstate = next_alloc++;
1523 TRIE_LIST_PUSH( state, charid, newstate );
1524 transcount++;
1525 }
1526 state = newstate;
1527 } else {
1528 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1529 }
a28509cc 1530 }
c445ea15 1531 }
3dab1dad 1532 TRIE_HANDLE_WORD(state);
a3621e74
YO
1533
1534 } /* end second pass */
1535
1e2e3d02
YO
1536 /* next alloc is the NEXT state to be allocated */
1537 trie->statecount = next_alloc;
c944940b
JH
1538 trie->states = (reg_trie_state *)
1539 PerlMemShared_realloc( trie->states,
1540 next_alloc
1541 * sizeof(reg_trie_state) );
a3621e74 1542
3dab1dad 1543 /* and now dump it out before we compress it */
2b8b4781
NC
1544 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1545 revcharmap, next_alloc,
1546 depth+1)
1e2e3d02 1547 );
a3621e74 1548
c944940b
JH
1549 trie->trans = (reg_trie_trans *)
1550 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1551 {
1552 U32 state;
a3621e74
YO
1553 U32 tp = 0;
1554 U32 zp = 0;
1555
1556
1557 for( state=1 ; state < next_alloc ; state ++ ) {
1558 U32 base=0;
1559
1560 /*
1561 DEBUG_TRIE_COMPILE_MORE_r(
1562 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1563 );
1564 */
1565
1566 if (trie->states[state].trans.list) {
1567 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1568 U16 maxid=minid;
a28509cc 1569 U16 idx;
a3621e74
YO
1570
1571 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1572 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1573 if ( forid < minid ) {
1574 minid=forid;
1575 } else if ( forid > maxid ) {
1576 maxid=forid;
1577 }
a3621e74
YO
1578 }
1579 if ( transcount < tp + maxid - minid + 1) {
1580 transcount *= 2;
c944940b
JH
1581 trie->trans = (reg_trie_trans *)
1582 PerlMemShared_realloc( trie->trans,
446bd890
NC
1583 transcount
1584 * sizeof(reg_trie_trans) );
a3621e74
YO
1585 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1586 }
1587 base = trie->uniquecharcount + tp - minid;
1588 if ( maxid == minid ) {
1589 U32 set = 0;
1590 for ( ; zp < tp ; zp++ ) {
1591 if ( ! trie->trans[ zp ].next ) {
1592 base = trie->uniquecharcount + zp - minid;
1593 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1594 trie->trans[ zp ].check = state;
1595 set = 1;
1596 break;
1597 }
1598 }
1599 if ( !set ) {
1600 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1601 trie->trans[ tp ].check = state;
1602 tp++;
1603 zp = tp;
1604 }
1605 } else {
1606 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1607 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1608 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1609 trie->trans[ tid ].check = state;
1610 }
1611 tp += ( maxid - minid + 1 );
1612 }
1613 Safefree(trie->states[ state ].trans.list);
1614 }
1615 /*
1616 DEBUG_TRIE_COMPILE_MORE_r(
1617 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1618 );
1619 */
1620 trie->states[ state ].trans.base=base;
1621 }
cc601c31 1622 trie->lasttrans = tp + 1;
a3621e74
YO
1623 }
1624 } else {
1625 /*
1626 Second Pass -- Flat Table Representation.
1627
1628 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1629 We know that we will need Charcount+1 trans at most to store the data
1630 (one row per char at worst case) So we preallocate both structures
1631 assuming worst case.
1632
1633 We then construct the trie using only the .next slots of the entry
1634 structs.
1635
1636 We use the .check field of the first entry of the node temporarily to
1637 make compression both faster and easier by keeping track of how many non
1638 zero fields are in the node.
1639
1640 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1641 transition.
1642
1643 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1644 number representing the first entry of the node, and state as a
1645 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1646 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1647 are 2 entrys per node. eg:
1648
1649 A B A B
1650 1. 2 4 1. 3 7
1651 2. 0 3 3. 0 5
1652 3. 0 0 5. 0 0
1653 4. 0 0 7. 0 0
1654
1655 The table is internally in the right hand, idx form. However as we also
1656 have to deal with the states array which is indexed by nodenum we have to
1657 use TRIE_NODENUM() to convert.
1658
1659 */
1e2e3d02
YO
1660 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1661 "%*sCompiling trie using table compiler\n",
1662 (int)depth * 2 + 2, ""));
3dab1dad 1663
c944940b
JH
1664 trie->trans = (reg_trie_trans *)
1665 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1666 * trie->uniquecharcount + 1,
1667 sizeof(reg_trie_trans) );
1668 trie->states = (reg_trie_state *)
1669 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1670 sizeof(reg_trie_state) );
a3621e74
YO
1671 next_alloc = trie->uniquecharcount + 1;
1672
3dab1dad 1673
a3621e74
YO
1674 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1675
c445ea15 1676 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1677 const U8 *uc = (U8*)STRING( noper );
1678 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1679
1680 U32 state = 1; /* required init */
1681
1682 U16 charid = 0; /* sanity init */
1683 U32 accept_state = 0; /* sanity init */
1684 U8 *scan = (U8*)NULL; /* sanity init */
1685
1686 STRLEN foldlen = 0; /* required init */
07be1b83 1687 U32 wordlen = 0; /* required init */
a3621e74
YO
1688 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1689
3dab1dad 1690 if ( OP(noper) != NOTHING ) {
786e8c11 1691 for ( ; uc < e ; uc += len ) {
a3621e74 1692
786e8c11 1693 TRIE_READ_CHAR;
a3621e74 1694
786e8c11
YO
1695 if ( uvc < 256 ) {
1696 charid = trie->charmap[ uvc ];
1697 } else {
55eed653 1698 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1699 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1700 }
786e8c11
YO
1701 if ( charid ) {
1702 charid--;
1703 if ( !trie->trans[ state + charid ].next ) {
1704 trie->trans[ state + charid ].next = next_alloc;
1705 trie->trans[ state ].check++;
1706 next_alloc += trie->uniquecharcount;
1707 }
1708 state = trie->trans[ state + charid ].next;
1709 } else {
1710 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1711 }
1712 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1713 }
a3621e74 1714 }
3dab1dad
YO
1715 accept_state = TRIE_NODENUM( state );
1716 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1717
1718 } /* end second pass */
1719
3dab1dad 1720 /* and now dump it out before we compress it */
2b8b4781
NC
1721 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1722 revcharmap,
1723 next_alloc, depth+1));
a3621e74 1724
a3621e74
YO
1725 {
1726 /*
1727 * Inplace compress the table.*
1728
1729 For sparse data sets the table constructed by the trie algorithm will
1730 be mostly 0/FAIL transitions or to put it another way mostly empty.
1731 (Note that leaf nodes will not contain any transitions.)
1732
1733 This algorithm compresses the tables by eliminating most such
1734 transitions, at the cost of a modest bit of extra work during lookup:
1735
1736 - Each states[] entry contains a .base field which indicates the
1737 index in the state[] array wheres its transition data is stored.
1738
1739 - If .base is 0 there are no valid transitions from that node.
1740
1741 - If .base is nonzero then charid is added to it to find an entry in
1742 the trans array.
1743
1744 -If trans[states[state].base+charid].check!=state then the
1745 transition is taken to be a 0/Fail transition. Thus if there are fail
1746 transitions at the front of the node then the .base offset will point
1747 somewhere inside the previous nodes data (or maybe even into a node
1748 even earlier), but the .check field determines if the transition is
1749 valid.
1750
786e8c11 1751 XXX - wrong maybe?
a3621e74
YO
1752 The following process inplace converts the table to the compressed
1753 table: We first do not compress the root node 1,and mark its all its
1754 .check pointers as 1 and set its .base pointer as 1 as well. This
1755 allows to do a DFA construction from the compressed table later, and
1756 ensures that any .base pointers we calculate later are greater than
1757 0.
1758
1759 - We set 'pos' to indicate the first entry of the second node.
1760
1761 - We then iterate over the columns of the node, finding the first and
1762 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1763 and set the .check pointers accordingly, and advance pos
1764 appropriately and repreat for the next node. Note that when we copy
1765 the next pointers we have to convert them from the original
1766 NODEIDX form to NODENUM form as the former is not valid post
1767 compression.
1768
1769 - If a node has no transitions used we mark its base as 0 and do not
1770 advance the pos pointer.
1771
1772 - If a node only has one transition we use a second pointer into the
1773 structure to fill in allocated fail transitions from other states.
1774 This pointer is independent of the main pointer and scans forward
1775 looking for null transitions that are allocated to a state. When it
1776 finds one it writes the single transition into the "hole". If the
786e8c11 1777 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1778
1779 - Once compressed we can Renew/realloc the structures to release the
1780 excess space.
1781
1782 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1783 specifically Fig 3.47 and the associated pseudocode.
1784
1785 demq
1786 */
a3b680e6 1787 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1788 U32 state, charid;
a3621e74 1789 U32 pos = 0, zp=0;
1e2e3d02 1790 trie->statecount = laststate;
a3621e74
YO
1791
1792 for ( state = 1 ; state < laststate ; state++ ) {
1793 U8 flag = 0;
a28509cc
AL
1794 const U32 stateidx = TRIE_NODEIDX( state );
1795 const U32 o_used = trie->trans[ stateidx ].check;
1796 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1797 trie->trans[ stateidx ].check = 0;
1798
1799 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1800 if ( flag || trie->trans[ stateidx + charid ].next ) {
1801 if ( trie->trans[ stateidx + charid ].next ) {
1802 if (o_used == 1) {
1803 for ( ; zp < pos ; zp++ ) {
1804 if ( ! trie->trans[ zp ].next ) {
1805 break;
1806 }
1807 }
1808 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1809 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1810 trie->trans[ zp ].check = state;
1811 if ( ++zp > pos ) pos = zp;
1812 break;
1813 }
1814 used--;
1815 }
1816 if ( !flag ) {
1817 flag = 1;
1818 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1819 }
1820 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1821 trie->trans[ pos ].check = state;
1822 pos++;
1823 }
1824 }
1825 }
cc601c31 1826 trie->lasttrans = pos + 1;
c944940b
JH
1827 trie->states = (reg_trie_state *)
1828 PerlMemShared_realloc( trie->states, laststate
1829 * sizeof(reg_trie_state) );
a3621e74 1830 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1831 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1832 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1833 (int)depth * 2 + 2,"",
1834 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1835 (IV)next_alloc,
1836 (IV)pos,
a3621e74
YO
1837 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1838 );
1839
1840 } /* end table compress */
1841 }
1e2e3d02
YO
1842 DEBUG_TRIE_COMPILE_MORE_r(
1843 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1844 (int)depth * 2 + 2, "",
1845 (UV)trie->statecount,
1846 (UV)trie->lasttrans)
1847 );
cc601c31 1848 /* resize the trans array to remove unused space */
c944940b
JH
1849 trie->trans = (reg_trie_trans *)
1850 PerlMemShared_realloc( trie->trans, trie->lasttrans
1851 * sizeof(reg_trie_trans) );
a3621e74 1852
3dab1dad 1853 /* and now dump out the compressed format */
2b8b4781 1854 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1855
3dab1dad 1856 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1857 U8 nodetype =(U8)(flags & 0xFF);
1858 char *str=NULL;
786e8c11 1859
07be1b83 1860#ifdef DEBUGGING
e62cc96a 1861 regnode *optimize = NULL;
7122b237
YO
1862#ifdef RE_TRACK_PATTERN_OFFSETS
1863
b57a0404
JH
1864 U32 mjd_offset = 0;
1865 U32 mjd_nodelen = 0;
7122b237
YO
1866#endif /* RE_TRACK_PATTERN_OFFSETS */
1867#endif /* DEBUGGING */
a3621e74 1868 /*
3dab1dad
YO
1869 This means we convert either the first branch or the first Exact,
1870 depending on whether the thing following (in 'last') is a branch
1871 or not and whther first is the startbranch (ie is it a sub part of
1872 the alternation or is it the whole thing.)
1873 Assuming its a sub part we conver the EXACT otherwise we convert
1874 the whole branch sequence, including the first.
a3621e74 1875 */
3dab1dad 1876 /* Find the node we are going to overwrite */
7f69552c 1877 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1878 /* branch sub-chain */
3dab1dad 1879 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1880#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1881 DEBUG_r({
1882 mjd_offset= Node_Offset((convert));
1883 mjd_nodelen= Node_Length((convert));
1884 });
7122b237 1885#endif
7f69552c 1886 /* whole branch chain */
7122b237
YO
1887 }
1888#ifdef RE_TRACK_PATTERN_OFFSETS
1889 else {
7f69552c
YO
1890 DEBUG_r({
1891 const regnode *nop = NEXTOPER( convert );
1892 mjd_offset= Node_Offset((nop));
1893 mjd_nodelen= Node_Length((nop));
1894 });
07be1b83
YO
1895 }
1896 DEBUG_OPTIMISE_r(
1897 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1898 (int)depth * 2 + 2, "",
786e8c11 1899 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1900 );
7122b237 1901#endif
3dab1dad
YO
1902 /* But first we check to see if there is a common prefix we can
1903 split out as an EXACT and put in front of the TRIE node. */
1904 trie->startstate= 1;
55eed653 1905 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1906 U32 state;
1e2e3d02 1907 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1908 U32 ofs = 0;
8e11feef
RGS
1909 I32 idx = -1;
1910 U32 count = 0;
1911 const U32 base = trie->states[ state ].trans.base;
a3621e74 1912
3dab1dad 1913 if ( trie->states[state].wordnum )
8e11feef 1914 count = 1;
a3621e74 1915
8e11feef 1916 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1917 if ( ( base + ofs >= trie->uniquecharcount ) &&
1918 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1919 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1920 {
3dab1dad 1921 if ( ++count > 1 ) {
2b8b4781 1922 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1923 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1924 if ( state == 1 ) break;
3dab1dad
YO
1925 if ( count == 2 ) {
1926 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1927 DEBUG_OPTIMISE_r(
8e11feef
RGS
1928 PerlIO_printf(Perl_debug_log,
1929 "%*sNew Start State=%"UVuf" Class: [",
1930 (int)depth * 2 + 2, "",
786e8c11 1931 (UV)state));
be8e71aa 1932 if (idx >= 0) {
2b8b4781 1933 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1934 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1935
3dab1dad 1936 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1937 if ( folder )
1938 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1939 DEBUG_OPTIMISE_r(
07be1b83 1940 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1941 );
8e11feef
RGS
1942 }
1943 }
1944 TRIE_BITMAP_SET(trie,*ch);
1945 if ( folder )
1946 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1947 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1948 }
1949 idx = ofs;
1950 }
3dab1dad
YO
1951 }
1952 if ( count == 1 ) {
2b8b4781 1953 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1954 char *ch = SvPV_nolen( *tmp );
1955 DEBUG_OPTIMISE_r({
1956 SV *sv=sv_newmortal();
8e11feef
RGS
1957 PerlIO_printf( Perl_debug_log,
1958 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1959 (int)depth * 2 + 2, "",
de734bd5
A
1960 (UV)state, (UV)idx,
1961 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1962 PL_colors[0], PL_colors[1],
1963 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1964 PERL_PV_ESCAPE_FIRSTCHAR
1965 )
1966 );
1967 });
3dab1dad
YO
1968 if ( state==1 ) {
1969 OP( convert ) = nodetype;
1970 str=STRING(convert);
1971 STR_LEN(convert)=0;
1972 }
de734bd5
A
1973 while (*ch) {
1974 *str++ = *ch++;
1975 STR_LEN(convert)++;
1976 }
1977
8e11feef 1978 } else {
f9049ba1 1979#ifdef DEBUGGING
8e11feef
RGS
1980 if (state>1)
1981 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1982#endif
8e11feef
RGS
1983 break;
1984 }
1985 }
3dab1dad 1986 if (str) {
8e11feef 1987 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1988 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1989 trie->startstate = state;
07be1b83
YO
1990 trie->minlen -= (state - 1);
1991 trie->maxlen -= (state - 1);
1992 DEBUG_r({
1993 regnode *fix = convert;
de734bd5 1994 U32 word = trie->wordcount;
07be1b83
YO
1995 mjd_nodelen++;
1996 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1997 while( ++fix < n ) {
1998 Set_Node_Offset_Length(fix, 0, 0);
1999 }
de734bd5 2000 while (word--) {
2b8b4781 2001 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
2002 if (tmp) {
2003 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2004 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2005 else
2006 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2007 }
2008 }
07be1b83 2009 });
8e11feef
RGS
2010 if (trie->maxlen) {
2011 convert = n;
2012 } else {
3dab1dad 2013 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2014 DEBUG_r(optimize= n);
3dab1dad
YO
2015 }
2016 }
2017 }
a5ca303d
YO
2018 if (!jumper)
2019 jumper = last;
3dab1dad 2020 if ( trie->maxlen ) {
8e11feef
RGS
2021 NEXT_OFF( convert ) = (U16)(tail - convert);
2022 ARG_SET( convert, data_slot );
786e8c11
YO
2023 /* Store the offset to the first unabsorbed branch in
2024 jump[0], which is otherwise unused by the jump logic.
2025 We use this when dumping a trie and during optimisation. */
2026 if (trie->jump)
7f69552c 2027 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2028
786e8c11
YO
2029 /* XXXX */
2030 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2031 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2032 {
2033 OP( convert ) = TRIEC;
2034 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2035 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2036 trie->bitmap= NULL;
2037 } else
2038 OP( convert ) = TRIE;
a3621e74 2039
3dab1dad
YO
2040 /* store the type in the flags */
2041 convert->flags = nodetype;
a5ca303d
YO
2042 DEBUG_r({
2043 optimize = convert
2044 + NODE_STEP_REGNODE
2045 + regarglen[ OP( convert ) ];
2046 });
2047 /* XXX We really should free up the resource in trie now,
2048 as we won't use them - (which resources?) dmq */
3dab1dad 2049 }
a3621e74 2050 /* needed for dumping*/
e62cc96a 2051 DEBUG_r(if (optimize) {
07be1b83 2052 regnode *opt = convert;
bcdf7404 2053
e62cc96a 2054 while ( ++opt < optimize) {
07be1b83
YO
2055 Set_Node_Offset_Length(opt,0,0);
2056 }
786e8c11
YO
2057 /*
2058 Try to clean up some of the debris left after the
2059 optimisation.
a3621e74 2060 */
786e8c11 2061 while( optimize < jumper ) {
07be1b83 2062 mjd_nodelen += Node_Length((optimize));
a3621e74 2063 OP( optimize ) = OPTIMIZED;
07be1b83 2064 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2065 optimize++;
2066 }
07be1b83 2067 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2068 });
2069 } /* end node insert */
55eed653 2070 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2071#ifdef DEBUGGING
2072 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2073 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2074#else
2075 SvREFCNT_dec(revcharmap);
07be1b83 2076#endif
786e8c11
YO
2077 return trie->jump
2078 ? MADE_JUMP_TRIE
2079 : trie->startstate>1
2080 ? MADE_EXACT_TRIE
2081 : MADE_TRIE;
2082}
2083
2084STATIC void
2085S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2086{
2087/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2088
2089 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2090 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2091 ISBN 0-201-10088-6
2092
2093 We find the fail state for each state in the trie, this state is the longest proper
2094 suffix of the current states 'word' that is also a proper prefix of another word in our
2095 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2096 the DFA not to have to restart after its tried and failed a word at a given point, it
2097 simply continues as though it had been matching the other word in the first place.
2098 Consider
2099 'abcdgu'=~/abcdefg|cdgu/
2100 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2101 fail, which would bring use to the state representing 'd' in the second word where we would
2102 try 'g' and succeed, prodceding to match 'cdgu'.
2103 */
2104 /* add a fail transition */
3251b653
NC
2105 const U32 trie_offset = ARG(source);
2106 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2107 U32 *q;
2108 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2109 const U32 numstates = trie->statecount;
786e8c11
YO
2110 const U32 ubound = trie->lasttrans + ucharcount;
2111 U32 q_read = 0;
2112 U32 q_write = 0;
2113 U32 charid;
2114 U32 base = trie->states[ 1 ].trans.base;
2115 U32 *fail;
2116 reg_ac_data *aho;
2117 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2118 GET_RE_DEBUG_FLAGS_DECL;
2119#ifndef DEBUGGING
2120 PERL_UNUSED_ARG(depth);
2121#endif
2122
2123
2124 ARG_SET( stclass, data_slot );
c944940b 2125 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2126 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2127 aho->trie=trie_offset;
446bd890
NC
2128 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2129 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2130 Newxz( q, numstates, U32);
c944940b 2131 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2132 aho->refcount = 1;
2133 fail = aho->fail;
2134 /* initialize fail[0..1] to be 1 so that we always have
2135 a valid final fail state */
2136 fail[ 0 ] = fail[ 1 ] = 1;
2137
2138 for ( charid = 0; charid < ucharcount ; charid++ ) {
2139 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2140 if ( newstate ) {
2141 q[ q_write ] = newstate;
2142 /* set to point at the root */
2143 fail[ q[ q_write++ ] ]=1;
2144 }
2145 }
2146 while ( q_read < q_write) {
2147 const U32 cur = q[ q_read++ % numstates ];
2148 base = trie->states[ cur ].trans.base;
2149
2150 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2151 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2152 if (ch_state) {
2153 U32 fail_state = cur;
2154 U32 fail_base;
2155 do {
2156 fail_state = fail[ fail_state ];
2157 fail_base = aho->states[ fail_state ].trans.base;
2158 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2159
2160 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2161 fail[ ch_state ] = fail_state;
2162 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2163 {
2164 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2165 }
2166 q[ q_write++ % numstates] = ch_state;
2167 }
2168 }
2169 }
2170 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2171 when we fail in state 1, this allows us to use the
2172 charclass scan to find a valid start char. This is based on the principle
2173 that theres a good chance the string being searched contains lots of stuff
2174 that cant be a start char.
2175 */
2176 fail[ 0 ] = fail[ 1 ] = 0;
2177 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2178 PerlIO_printf(Perl_debug_log,
2179 "%*sStclass Failtable (%"UVuf" states): 0",
2180 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2181 );
786e8c11
YO
2182 for( q_read=1; q_read<numstates; q_read++ ) {
2183 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2184 }
2185 PerlIO_printf(Perl_debug_log, "\n");
2186 });
2187 Safefree(q);
2188 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2189}
2190
786e8c11 2191
a3621e74 2192/*
5d1c421c
JH
2193 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2194 * These need to be revisited when a newer toolchain becomes available.
2195 */
2196#if defined(__sparc64__) && defined(__GNUC__)
2197# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2198# undef SPARC64_GCC_WORKAROUND
2199# define SPARC64_GCC_WORKAROUND 1
2200# endif
2201#endif
2202
07be1b83 2203#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2204 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2205 SV * const mysv=sv_newmortal(); \
2206 regnode *Next = regnext(scan); \
2207 regprop(RExC_rx, mysv, scan); \
7f69552c 2208 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2209 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2210 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2211 }});
07be1b83 2212
1de06328
YO
2213
2214
2215
2216
07be1b83
YO
2217#define JOIN_EXACT(scan,min,flags) \
2218 if (PL_regkind[OP(scan)] == EXACT) \
2219 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2220
be8e71aa 2221STATIC U32
07be1b83
YO
2222S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2223 /* Merge several consecutive EXACTish nodes into one. */
2224 regnode *n = regnext(scan);
2225 U32 stringok = 1;
2226 regnode *next = scan + NODE_SZ_STR(scan);
2227 U32 merged = 0;
2228 U32 stopnow = 0;
2229#ifdef DEBUGGING
2230 regnode *stop = scan;
72f13be8 2231 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2232#else
d47053eb
RGS
2233 PERL_UNUSED_ARG(depth);
2234#endif
2235#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2236 PERL_UNUSED_ARG(flags);
2237 PERL_UNUSED_ARG(val);
07be1b83 2238#endif
07be1b83
YO
2239 DEBUG_PEEP("join",scan,depth);
2240
2241 /* Skip NOTHING, merge EXACT*. */
2242 while (n &&
2243 ( PL_regkind[OP(n)] == NOTHING ||
2244 (stringok && (OP(n) == OP(scan))))
2245 && NEXT_OFF(n)
2246 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2247
2248 if (OP(n) == TAIL || n > next)
2249 stringok = 0;
2250 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2251 DEBUG_PEEP("skip:",n,depth);
2252 NEXT_OFF(scan) += NEXT_OFF(n);
2253 next = n + NODE_STEP_REGNODE;
2254#ifdef DEBUGGING
2255 if (stringok)
2256 stop = n;
2257#endif
2258 n = regnext(n);
2259 }
2260 else if (stringok) {
786e8c11 2261 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2262 regnode * const nnext = regnext(n);
2263
2264 DEBUG_PEEP("merg",n,depth);
2265
2266 merged++;
2267 if (oldl + STR_LEN(n) > U8_MAX)
2268 break;
2269 NEXT_OFF(scan) += NEXT_OFF(n);
2270 STR_LEN(scan) += STR_LEN(n);
2271 next = n + NODE_SZ_STR(n);
2272 /* Now we can overwrite *n : */
2273 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2274#ifdef DEBUGGING
2275 stop = next - 1;
2276#endif
2277 n = nnext;
2278 if (stopnow) break;
2279 }
2280
d47053eb
RGS
2281#ifdef EXPERIMENTAL_INPLACESCAN
2282 if (flags && !NEXT_OFF(n)) {
2283 DEBUG_PEEP("atch", val, depth);
2284 if (reg_off_by_arg[OP(n)]) {
2285 ARG_SET(n, val - n);
2286 }
2287 else {
2288 NEXT_OFF(n) = val - n;
2289 }
2290 stopnow = 1;
2291 }
07be1b83
YO
2292#endif
2293 }
2294
2295 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2296 /*
2297 Two problematic code points in Unicode casefolding of EXACT nodes:
2298
2299 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2300 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2301
2302 which casefold to
2303
2304 Unicode UTF-8
2305
2306 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2307 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2308
2309 This means that in case-insensitive matching (or "loose matching",
2310 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2311 length of the above casefolded versions) can match a target string
2312 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2313 This would rather mess up the minimum length computation.
2314
2315 What we'll do is to look for the tail four bytes, and then peek
2316 at the preceding two bytes to see whether we need to decrease
2317 the minimum length by four (six minus two).
2318
2319 Thanks to the design of UTF-8, there cannot be false matches:
2320 A sequence of valid UTF-8 bytes cannot be a subsequence of
2321 another valid sequence of UTF-8 bytes.
2322
2323 */
2324 char * const s0 = STRING(scan), *s, *t;
2325 char * const s1 = s0 + STR_LEN(scan) - 1;
2326 char * const s2 = s1 - 4;
e294cc5d
JH
2327#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2328 const char t0[] = "\xaf\x49\xaf\x42";
2329#else
07be1b83 2330 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2331#endif
07be1b83
YO
2332 const char * const t1 = t0 + 3;
2333
2334 for (s = s0 + 2;
2335 s < s2 && (t = ninstr(s, s1, t0, t1));
2336 s = t + 4) {
e294cc5d
JH
2337#ifdef EBCDIC
2338 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2339 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2340#else
07be1b83
YO
2341 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2342 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2343#endif
07be1b83
YO
2344 *min -= 4;
2345 }
2346 }
2347
2348#ifdef DEBUGGING
2349 /* Allow dumping */
2350 n = scan + NODE_SZ_STR(scan);
2351 while (n <= stop) {
2352 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2353 OP(n) = OPTIMIZED;
2354 NEXT_OFF(n) = 0;
2355 }
2356 n++;
2357 }
2358#endif
2359 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2360 return stopnow;
2361}
2362
653099ff
GS
2363/* REx optimizer. Converts nodes into quickier variants "in place".
2364 Finds fixed substrings. */
2365
a0288114 2366/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2367 to the position after last scanned or to NULL. */
2368
40d049e4
YO
2369#define INIT_AND_WITHP \
2370 assert(!and_withp); \
2371 Newx(and_withp,1,struct regnode_charclass_class); \
2372 SAVEFREEPV(and_withp)
07be1b83 2373
b515a41d
YO
2374/* this is a chain of data about sub patterns we are processing that
2375 need to be handled seperately/specially in study_chunk. Its so
2376 we can simulate recursion without losing state. */
2377struct scan_frame;
2378typedef struct scan_frame {
2379 regnode *last; /* last node to process in this frame */
2380 regnode *next; /* next node to process when last is reached */
2381 struct scan_frame *prev; /*previous frame*/
2382 I32 stop; /* what stopparen do we use */
2383} scan_frame;
2384
304ee84b
YO
2385
2386#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2387
76e3520e 2388STATIC I32
40d049e4 2389S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2390 I32 *minlenp, I32 *deltap,
40d049e4
YO
2391 regnode *last,
2392 scan_data_t *data,
2393 I32 stopparen,
2394 U8* recursed,
2395 struct regnode_charclass_class *and_withp,
2396 U32 flags, U32 depth)
c277df42
IZ
2397 /* scanp: Start here (read-write). */
2398 /* deltap: Write maxlen-minlen here. */
2399 /* last: Stop before this one. */
40d049e4
YO
2400 /* data: string data about the pattern */
2401 /* stopparen: treat close N as END */
2402 /* recursed: which subroutines have we recursed into */
2403 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2404{
97aff369 2405 dVAR;
c277df42
IZ
2406 I32 min = 0, pars = 0, code;
2407 regnode *scan = *scanp, *next;
2408 I32 delta = 0;
2409 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2410 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2411 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2412 scan_data_t data_fake;
a3621e74 2413 SV *re_trie_maxbuff = NULL;
786e8c11 2414 regnode *first_non_open = scan;
e2e6a0f1 2415 I32 stopmin = I32_MAX;
8aa23a47
YO
2416 scan_frame *frame = NULL;
2417
a3621e74 2418 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2419
13a24bad 2420#ifdef DEBUGGING
40d049e4 2421 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2422#endif
40d049e4 2423
786e8c11 2424 if ( depth == 0 ) {
40d049e4 2425 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2426 first_non_open=regnext(first_non_open);
2427 }
2428
b81d288d 2429
8aa23a47
YO
2430 fake_study_recurse:
2431 while ( scan && OP(scan) != END && scan < last ){
2432 /* Peephole optimizer: */
304ee84b 2433 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2434 DEBUG_PEEP("Peep",scan,depth);
2435 JOIN_EXACT(scan,&min,0);
2436
2437 /* Follow the next-chain of the current node and optimize
2438 away all the NOTHINGs from it. */
2439 if (OP(scan) != CURLYX) {
2440 const int max = (reg_off_by_arg[OP(scan)]
2441 ? I32_MAX
2442 /* I32 may be smaller than U16 on CRAYs! */
2443 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2444 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2445 int noff;
2446 regnode *n = scan;
2447
2448 /* Skip NOTHING and LONGJMP. */
2449 while ((n = regnext(n))
2450 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2451 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2452 && off + noff < max)
2453 off += noff;
2454 if (reg_off_by_arg[OP(scan)])
2455 ARG(scan) = off;
2456 else
2457 NEXT_OFF(scan) = off;
2458 }
a3621e74 2459
c277df42 2460
8aa23a47
YO
2461
2462 /* The principal pseudo-switch. Cannot be a switch, since we
2463 look into several different things. */
2464 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2465 || OP(scan) == IFTHEN) {
2466 next = regnext(scan);
2467 code = OP(scan);
2468 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2469
2470 if (OP(next) == code || code == IFTHEN) {
2471 /* NOTE - There is similar code to this block below for handling
2472 TRIE nodes on a re-study. If you change stuff here check there
2473 too. */
2474 I32 max1 = 0, min1 = I32_MAX, num = 0;
2475 struct regnode_charclass_class accum;
2476 regnode * const startbranch=scan;
2477
2478 if (flags & SCF_DO_SUBSTR)
304ee84b 2479 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2480 if (flags & SCF_DO_STCLASS)
2481 cl_init_zero(pRExC_state, &accum);
2482
2483 while (OP(scan) == code) {
2484 I32 deltanext, minnext, f = 0, fake;
2485 struct regnode_charclass_class this_class;
2486
2487 num++;
2488 data_fake.flags = 0;
2489 if (data) {
2490 data_fake.whilem_c = data->whilem_c;
2491 data_fake.last_closep = data->last_closep;
2492 }
2493 else
2494 data_fake.last_closep = &fake;
58e23c8d
YO
2495
2496 data_fake.pos_delta = delta;
8aa23a47
YO
2497 next = regnext(scan);
2498 scan = NEXTOPER(scan);
2499 if (code != BRANCH)
c277df42 2500 scan = NEXTOPER(scan);
8aa23a47
YO
2501 if (flags & SCF_DO_STCLASS) {
2502 cl_init(pRExC_state, &this_class);
2503 data_fake.start_class = &this_class;
2504 f = SCF_DO_STCLASS_AND;
58e23c8d 2505 }
8aa23a47
YO
2506 if (flags & SCF_WHILEM_VISITED_POS)
2507 f |= SCF_WHILEM_VISITED_POS;
2508
2509 /* we suppose the run is continuous, last=next...*/
2510 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2511 next, &data_fake,
2512 stopparen, recursed, NULL, f,depth+1);
2513 if (min1 > minnext)
2514 min1 = minnext;
2515 if (max1 < minnext + deltanext)
2516 max1 = minnext + deltanext;
2517 if (deltanext == I32_MAX)
2518 is_inf = is_inf_internal = 1;
2519 scan = next;
2520 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2521 pars++;
2522 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2523 if ( stopmin > minnext)
2524 stopmin = min + min1;
2525 flags &= ~SCF_DO_SUBSTR;
2526 if (data)
2527 data->flags |= SCF_SEEN_ACCEPT;
2528 }
2529 if (data) {
2530 if (data_fake.flags & SF_HAS_EVAL)
2531 data->flags |= SF_HAS_EVAL;
2532 data->whilem_c = data_fake.whilem_c;
3dab1dad 2533 }
8aa23a47
YO
2534 if (flags & SCF_DO_STCLASS)
2535 cl_or(pRExC_state, &accum, &this_class);
2536 }
2537 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2538 min1 = 0;
2539 if (flags & SCF_DO_SUBSTR) {
2540 data->pos_min += min1;
2541 data->pos_delta += max1 - min1;
2542 if (max1 != min1 || is_inf)
2543 data->longest = &(data->longest_float);
2544 }
2545 min += min1;
2546 delta += max1 - min1;
2547 if (flags & SCF_DO_STCLASS_OR) {
2548 cl_or(pRExC_state, data->start_class, &accum);
2549 if (min1) {
2550 cl_and(data->start_class, and_withp);
2551 flags &= ~SCF_DO_STCLASS;
653099ff 2552 }
8aa23a47
YO
2553 }
2554 else if (flags & SCF_DO_STCLASS_AND) {
2555 if (min1) {
2556 cl_and(data->start_class, &accum);
2557 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2558 }
8aa23a47
YO
2559 else {
2560 /* Switch to OR mode: cache the old value of
2561 * data->start_class */
2562 INIT_AND_WITHP;
2563 StructCopy(data->start_class, and_withp,
2564 struct regnode_charclass_class);
2565 flags &= ~SCF_DO_STCLASS_AND;
2566 StructCopy(&accum, data->start_class,
2567 struct regnode_charclass_class);
2568 flags |= SCF_DO_STCLASS_OR;
2569 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2570 }
8aa23a47 2571 }
a3621e74 2572
8aa23a47
YO
2573 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2574 /* demq.
a3621e74 2575
8aa23a47
YO
2576 Assuming this was/is a branch we are dealing with: 'scan' now
2577 points at the item that follows the branch sequence, whatever
2578 it is. We now start at the beginning of the sequence and look
2579 for subsequences of
a3621e74 2580
8aa23a47
YO
2581 BRANCH->EXACT=>x1
2582 BRANCH->EXACT=>x2
2583 tail
a3621e74 2584
8aa23a47 2585 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2586
8aa23a47
YO
2587 If we can find such a subseqence we need to turn the first
2588 element into a trie and then add the subsequent branch exact
2589 strings to the trie.
a3621e74 2590
8aa23a47 2591 We have two cases
a3621e74 2592
8aa23a47 2593 1. patterns where the whole set of branch can be converted.
a3621e74 2594
8aa23a47 2595 2. patterns where only a subset can be converted.
a3621e74 2596
8aa23a47
YO
2597 In case 1 we can replace the whole set with a single regop
2598 for the trie. In case 2 we need to keep the start and end
2599 branchs so
a3621e74 2600
8aa23a47
YO
2601 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2602 becomes BRANCH TRIE; BRANCH X;
786e8c11 2603
8aa23a47
YO
2604 There is an additional case, that being where there is a
2605 common prefix, which gets split out into an EXACT like node
2606 preceding the TRIE node.
a3621e74 2607
8aa23a47
YO
2608 If x(1..n)==tail then we can do a simple trie, if not we make
2609 a "jump" trie, such that when we match the appropriate word
2610 we "jump" to the appopriate tail node. Essentailly we turn
2611 a nested if into a case structure of sorts.
b515a41d 2612
8aa23a47
YO
2613 */
2614
2615 int made=0;
2616 if (!re_trie_maxbuff) {
2617 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2618 if (!SvIOK(re_trie_maxbuff))
2619 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2620 }
2621 if ( SvIV(re_trie_maxbuff)>=0 ) {
2622 regnode *cur;
2623 regnode *first = (regnode *)NULL;
2624 regnode *last = (regnode *)NULL;
2625 regnode *tail = scan;
2626 U8 optype = 0;
2627 U32 count=0;
a3621e74
YO
2628
2629#ifdef DEBUGGING
8aa23a47 2630 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2631#endif
8aa23a47
YO
2632 /* var tail is used because there may be a TAIL
2633 regop in the way. Ie, the exacts will point to the
2634 thing following the TAIL, but the last branch will
2635 point at the TAIL. So we advance tail. If we
2636 have nested (?:) we may have to move through several
2637 tails.
2638 */
2639
2640 while ( OP( tail ) == TAIL ) {
2641 /* this is the TAIL generated by (?:) */
2642 tail = regnext( tail );
2643 }
a3621e74 2644
8aa23a47
YO
2645
2646 DEBUG_OPTIMISE_r({
2647 regprop(RExC_rx, mysv, tail );
2648 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2649 (int)depth * 2 + 2, "",
2650 "Looking for TRIE'able sequences. Tail node is: ",
2651 SvPV_nolen_const( mysv )
2652 );
2653 });
2654
2655 /*
2656
2657 step through the branches, cur represents each
2658 branch, noper is the first thing to be matched
2659 as part of that branch and noper_next is the
2660 regnext() of that node. if noper is an EXACT
2661 and noper_next is the same as scan (our current
2662 position in the regex) then the EXACT branch is
2663 a possible optimization target. Once we have
2664 two or more consequetive such branches we can
2665 create a trie of the EXACT's contents and stich
2666 it in place. If the sequence represents all of
2667 the branches we eliminate the whole thing and
2668 replace it with a single TRIE. If it is a
2669 subsequence then we need to stitch it in. This
2670 means the first branch has to remain, and needs
2671 to be repointed at the item on the branch chain
2672 following the last branch optimized. This could
2673 be either a BRANCH, in which case the
2674 subsequence is internal, or it could be the
2675 item following the branch sequence in which
2676 case the subsequence is at the end.
2677
2678 */
2679
2680 /* dont use tail as the end marker for this traverse */
2681 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2682 regnode * const noper = NEXTOPER( cur );
b515a41d 2683#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2684 regnode * const noper_next = regnext( noper );
b515a41d
YO
2685#endif
2686
8aa23a47
YO
2687 DEBUG_OPTIMISE_r({
2688 regprop(RExC_rx, mysv, cur);
2689 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2690 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2691
2692 regprop(RExC_rx, mysv, noper);
2693 PerlIO_printf( Perl_debug_log, " -> %s",
2694 SvPV_nolen_const(mysv));
2695
2696 if ( noper_next ) {
2697 regprop(RExC_rx, mysv, noper_next );
2698 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2699 SvPV_nolen_const(mysv));
2700 }
2701 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2702 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2703 });
2704 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2705 : PL_regkind[ OP( noper ) ] == EXACT )
2706 || OP(noper) == NOTHING )
786e8c11 2707#ifdef NOJUMPTRIE
8aa23a47 2708 && noper_next == tail
786e8c11 2709#endif
8aa23a47
YO
2710 && count < U16_MAX)
2711 {
2712 count++;
2713 if ( !first || optype == NOTHING ) {
2714 if (!first) first = cur;
2715 optype = OP( noper );
2716 } else {
2717 last = cur;
2718 }
2719 } else {
2720 if ( last ) {
2721 make_trie( pRExC_state,
2722 startbranch, first, cur, tail, count,
2723 optype, depth+1 );
2724 }
2725 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2726#ifdef NOJUMPTRIE
8aa23a47 2727 && noper_next == tail
786e8c11 2728#endif
8aa23a47
YO
2729 ){
2730 count = 1;
2731 first = cur;
2732 optype = OP( noper );
2733 } else {
2734 count = 0;
2735 first = NULL;
2736 optype = 0;
2737 }
2738 last = NULL;
2739 }
2740 }
2741 DEBUG_OPTIMISE_r({
2742 regprop(RExC_rx, mysv, cur);
2743 PerlIO_printf( Perl_debug_log,
2744 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2745 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2746
2747 });
2748 if ( last ) {
2749 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2750#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2751 if ( ((made == MADE_EXACT_TRIE &&
2752 startbranch == first)
2753 || ( first_non_open == first )) &&
2754 depth==0 ) {
2755 flags |= SCF_TRIE_RESTUDY;
2756 if ( startbranch == first
2757 && scan == tail )
2758 {
2759 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2760 }
2761 }
3dab1dad 2762#endif
8aa23a47
YO
2763 }
2764 }
2765
2766 } /* do trie */
2767
653099ff 2768 }
8aa23a47
YO
2769 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2770 scan = NEXTOPER(NEXTOPER(scan));
2771 } else /* single branch is optimized. */
2772 scan = NEXTOPER(scan);
2773 continue;
2774 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2775 scan_frame *newframe = NULL;
2776 I32 paren;
2777 regnode *start;
2778 regnode *end;
2779
2780 if (OP(scan) != SUSPEND) {
2781 /* set the pointer */
2782 if (OP(scan) == GOSUB) {
2783 paren = ARG(scan);
2784 RExC_recurse[ARG2L(scan)] = scan;
2785 start = RExC_open_parens[paren-1];
2786 end = RExC_close_parens[paren-1];
2787 } else {
2788 paren = 0;
f8fc2ecf 2789 start = RExC_rxi->program + 1;
8aa23a47
YO
2790 end = RExC_opend;
2791 }
2792 if (!recursed) {
2793 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2794 SAVEFREEPV(recursed);
2795 }
2796 if (!PAREN_TEST(recursed,paren+1)) {
2797 PAREN_SET(recursed,paren+1);
2798 Newx(newframe,1,scan_frame);
2799 } else {
2800 if (flags & SCF_DO_SUBSTR) {
304ee84b 2801 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2802 data->longest = &(data->longest_float);
2803 }
2804 is_inf = is_inf_internal = 1;
2805 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2806 cl_anything(pRExC_state, data->start_class);
2807 flags &= ~SCF_DO_STCLASS;
2808 }
2809 } else {
2810 Newx(newframe,1,scan_frame);
2811 paren = stopparen;
2812 start = scan+2;
2813 end = regnext(scan);
2814 }
2815 if (newframe) {
2816 assert(start);
2817 assert(end);
2818 SAVEFREEPV(newframe);
2819 newframe->next = regnext(scan);
2820 newframe->last = last;
2821 newframe->stop = stopparen;
2822 newframe->prev = frame;
2823
2824 frame = newframe;
2825 scan = start;
2826 stopparen = paren;
2827 last = end;
2828
2829 continue;
2830 }
2831 }
2832 else if (OP(scan) == EXACT) {
2833 I32 l = STR_LEN(scan);
2834 UV uc;
2835 if (UTF) {
2836 const U8 * const s = (U8*)STRING(scan);
2837 l = utf8_length(s, s + l);
2838 uc = utf8_to_uvchr(s, NULL);
2839 } else {
2840 uc = *((U8*)STRING(scan));
2841 }
2842 min += l;
2843 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2844 /* The code below prefers earlier match for fixed
2845 offset, later match for variable offset. */
2846 if (data->last_end == -1) { /* Update the start info. */
2847 data->last_start_min = data->pos_min;
2848 data->last_start_max = is_inf
2849 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2850 }
8aa23a47
YO
2851 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2852 if (UTF)
2853 SvUTF8_on(data->last_found);
2854 {
2855 SV * const sv = data->last_found;
2856 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2857 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2858 if (mg && mg->mg_len >= 0)
2859 mg->mg_len += utf8_length((U8*)STRING(scan),
2860 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2861 }
8aa23a47
YO
2862 data->last_end = data->pos_min + l;
2863 data->pos_min += l; /* As in the first entry. */
2864 data->flags &= ~SF_BEFORE_EOL;
2865 }
2866 if (flags & SCF_DO_STCLASS_AND) {
2867 /* Check whether it is compatible with what we know already! */
2868 int compat = 1;
2869
2870 if (uc >= 0x100 ||
2871 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2872 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2873 && (!(data->start_class->flags & ANYOF_FOLD)
2874 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2875 )
2876 compat = 0;
2877 ANYOF_CLASS_ZERO(data->start_class);
2878 ANYOF_BITMAP_ZERO(data->start_class);
2879 if (compat)
2880 ANYOF_BITMAP_SET(data->start_class, uc);
2881 data->start_class->flags &= ~ANYOF_EOS;
2882 if (uc < 0x100)
2883 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2884 }
2885 else if (flags & SCF_DO_STCLASS_OR) {
2886 /* false positive possible if the class is case-folded */
2887 if (uc < 0x100)
2888 ANYOF_BITMAP_SET(data->start_class, uc);
2889 else
2890 data->start_class->flags |= ANYOF_UNICODE_ALL;
2891 data->start_class->flags &= ~ANYOF_EOS;
2892 cl_and(data->start_class, and_withp);
2893 }
2894 flags &= ~SCF_DO_STCLASS;
2895 }
2896 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2897 I32 l = STR_LEN(scan);
2898 UV uc = *((U8*)STRING(scan));
2899
2900 /* Search for fixed substrings supports EXACT only. */
2901 if (flags & SCF_DO_SUBSTR) {
2902 assert(data);
304ee84b 2903 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2904 }
2905 if (UTF) {
2906 const U8 * const s = (U8 *)STRING(scan);
2907 l = utf8_length(s, s + l);
2908 uc = utf8_to_uvchr(s, NULL);
2909 }
2910 min += l;
2911 if (flags & SCF_DO_SUBSTR)
2912 data->pos_min += l;
2913 if (flags & SCF_DO_STCLASS_AND) {
2914 /* Check whether it is compatible with what we know already! */
2915 int compat = 1;
2916
2917 if (uc >= 0x100 ||
2918 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2919 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2920 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2921 compat = 0;
2922 ANYOF_CLASS_ZERO(data->start_class);
2923 ANYOF_BITMAP_ZERO(data->start_class);
2924 if (compat) {
2925 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2926 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2927 data->start_class->flags |= ANYOF_FOLD;
2928 if (OP(scan) == EXACTFL)
2929 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2930 }
8aa23a47
YO
2931 }
2932 else if (flags & SCF_DO_STCLASS_OR) {
2933 if (data->start_class->flags & ANYOF_FOLD) {
2934 /* false positive possible if the class is case-folded.
2935 Assume that the locale settings are the same... */
1aa99e6b
IH
2936 if (uc < 0x100)
2937 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2938 data->start_class->flags &= ~ANYOF_EOS;
2939 }
8aa23a47 2940 cl_and(data->start_class, and_withp);
653099ff 2941 }
8aa23a47
YO
2942 flags &= ~SCF_DO_STCLASS;
2943 }
2944 else if (strchr((const char*)PL_varies,OP(scan))) {
2945 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2946 I32 f = flags, pos_before = 0;
2947 regnode * const oscan = scan;
2948 struct regnode_charclass_class this_class;
2949 struct regnode_charclass_class *oclass = NULL;
2950 I32 next_is_eval = 0;
2951
2952 switch (PL_regkind[OP(scan)]) {
2953 case WHILEM: /* End of (?:...)* . */
2954 scan = NEXTOPER(scan);
2955 goto finish;
2956 case PLUS:
2957 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2958 next = NEXTOPER(scan);
2959 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2960 mincount = 1;
2961 maxcount = REG_INFTY;
2962 next = regnext(scan);
2963 scan = NEXTOPER(scan);
2964 goto do_curly;
2965 }
2966 }
2967 if (flags & SCF_DO_SUBSTR)
2968 data->pos_min++;
2969 min++;
2970 /* Fall through. */
2971 case STAR:
2972 if (flags & SCF_DO_STCLASS) {
2973 mincount = 0;
2974 maxcount = REG_INFTY;
2975 next = regnext(scan);
2976 scan = NEXTOPER(scan);
2977 goto do_curly;
2978 }
2979 is_inf = is_inf_internal = 1;
2980 scan = regnext(scan);
c277df42 2981 if (flags & SCF_DO_SUBSTR) {
304ee84b 2982 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2983 data->longest = &(data->longest_float);
c277df42 2984 }
8aa23a47
YO
2985 goto optimize_curly_tail;
2986 case CURLY:
2987 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2988 && (scan->flags == stopparen))
2989 {
2990 mincount = 1;
2991 maxcount = 1;
2992 } else {
2993 mincount = ARG1(scan);
2994 maxcount = ARG2(scan);
653099ff 2995 }
8aa23a47
YO
2996 next = regnext(scan);
2997 if (OP(scan) == CURLYX) {
2998 I32 lp = (data ? *(data->last_closep) : 0);
2999 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3000 }
8aa23a47
YO
3001 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3002 next_is_eval = (OP(scan) == EVAL);
3003 do_curly:
3004 if (flags & SCF_DO_SUBSTR) {
304ee84b 3005 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3006 pos_before = data->pos_min;
b45f050a 3007 }
8aa23a47
YO
3008 if (data) {
3009 fl = data->flags;
3010 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3011 if (is_inf)
3012 data->flags |= SF_IS_INF;
3013 }
3014 if (flags & SCF_DO_STCLASS) {
3015 cl_init(pRExC_state, &this_class);
3016 oclass = data->start_class;
3017 data->start_class = &this_class;
3018 f |= SCF_DO_STCLASS_AND;
3019 f &= ~SCF_DO_STCLASS_OR;
3020 }
3021 /* These are the cases when once a subexpression
3022 fails at a particular position, it cannot succeed
3023 even after backtracking at the enclosing scope.
3024
3025 XXXX what if minimal match and we are at the
3026 initial run of {n,m}? */
3027 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3028 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3029
8aa23a47
YO
3030 /* This will finish on WHILEM, setting scan, or on NULL: */
3031 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3032 last, data, stopparen, recursed, NULL,
3033 (mincount == 0
3034 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3035
8aa23a47
YO
3036 if (flags & SCF_DO_STCLASS)
3037 data->start_class = oclass;
3038 if (mincount == 0 || minnext == 0) {
3039 if (flags & SCF_DO_STCLASS_OR) {
3040 cl_or(pRExC_state, data->start_class, &this_class);
3041 }
3042 else if (flags & SCF_DO_STCLASS_AND) {
3043 /* Switch to OR mode: cache the old value of
3044 * data->start_class */
3045 INIT_AND_WITHP;
3046 StructCopy(data->start_class, and_withp,
3047 struct regnode_charclass_class);
3048 flags &= ~SCF_DO_STCLASS_AND;
3049 StructCopy(&this_class, data->start_class,
3050 struct regnode_charclass_class);
3051 flags |= SCF_DO_STCLASS_OR;
3052 data->start_class->flags |= ANYOF_EOS;
3053 }
3054 } else { /* Non-zero len */
3055 if (flags & SCF_DO_STCLASS_OR) {
3056 cl_or(pRExC_state, data->start_class, &this_class);
3057 cl_and(data->start_class, and_withp);
3058 }
3059 else if (flags & SCF_DO_STCLASS_AND)
3060 cl_and(data->start_class, &this_class);
3061 flags &= ~SCF_DO_STCLASS;
3062 }
3063 if (!scan) /* It was not CURLYX, but CURLY. */
3064 scan = next;
3065 if ( /* ? quantifier ok, except for (?{ ... }) */
3066 (next_is_eval || !(mincount == 0 && maxcount == 1))
3067 && (minnext == 0) && (deltanext == 0)
3068 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3069 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3070 && ckWARN(WARN_REGEXP))
3071 {
3072 vWARN(RExC_parse,
3073 "Quantifier unexpected on zero-length expression");
3074 }
3075
3076 min += minnext * mincount;
3077 is_inf_internal |= ((maxcount == REG_INFTY
3078 && (minnext + deltanext) > 0)
3079 || deltanext == I32_MAX);
3080 is_inf |= is_inf_internal;
3081 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3082
3083 /* Try powerful optimization CURLYX => CURLYN. */
3084 if ( OP(oscan) == CURLYX && data
3085 && data->flags & SF_IN_PAR
3086 && !(data->flags & SF_HAS_EVAL)
3087 && !deltanext && minnext == 1 ) {
3088 /* Try to optimize to CURLYN. */
3089 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3090 regnode * const nxt1 = nxt;
497b47a8 3091#ifdef DEBUGGING
8aa23a47 3092 regnode *nxt2;
497b47a8 3093#endif
c277df42 3094
8aa23a47
YO
3095 /* Skip open. */
3096 nxt = regnext(nxt);
3097 if (!strchr((const char*)PL_simple,OP(nxt))
3098 && !(PL_regkind[OP(nxt)] == EXACT
3099 && STR_LEN(nxt) == 1))
3100 goto nogo;
497b47a8 3101#ifdef DEBUGGING
8aa23a47 3102 nxt2 = nxt;
497b47a8 3103#endif
8aa23a47
YO
3104 nxt = regnext(nxt);
3105 if (OP(nxt) != CLOSE)
3106 goto nogo;
3107 if (RExC_open_parens) {
3108 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3109 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3110 }
3111 /* Now we know that nxt2 is the only contents: */
3112 oscan->flags = (U8)ARG(nxt);
3113 OP(oscan) = CURLYN;
3114 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3115
c277df42 3116#ifdef DEBUGGING
8aa23a47
YO
3117 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3118 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3119 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3120 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3121 OP(nxt + 1) = OPTIMIZED; /* was count. */
3122 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3123#endif
8aa23a47
YO
3124 }
3125 nogo:
3126
3127 /* Try optimization CURLYX => CURLYM. */
3128 if ( OP(oscan) == CURLYX && data
3129 && !(data->flags & SF_HAS_PAR)
3130 && !(data->flags & SF_HAS_EVAL)
3131 && !deltanext /* atom is fixed width */
3132 && minnext != 0 /* CURLYM can't handle zero width */
3133 ) {
3134 /* XXXX How to optimize if data == 0? */
3135 /* Optimize to a simpler form. */
3136 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3137 regnode *nxt2;
3138
3139 OP(oscan) = CURLYM;
3140 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3141 && (OP(nxt2) != WHILEM))
3142 nxt = nxt2;
3143 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3144 /* Need to optimize away parenths. */
3145 if (data->flags & SF_IN_PAR) {
3146 /* Set the parenth number. */
3147 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3148
3149 if (OP(nxt) != CLOSE)
3150 FAIL("Panic opt close");
3151 oscan->flags = (U8)ARG(nxt);
3152 if (RExC_open_parens) {
3153 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3154 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3155 }
8aa23a47
YO
3156 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3157 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3158
c277df42 3159#ifdef DEBUGGING
8aa23a47
YO
3160 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3161 OP(nxt + 1) = OPTIMIZED; /* was count. */
3162 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3163 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3164#endif
c277df42 3165#if 0
8aa23a47
YO
3166 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3167 regnode *nnxt = regnext(nxt1);
3168
3169 if (nnxt == nxt) {
3170 if (reg_off_by_arg[OP(nxt1)])
3171 ARG_SET(nxt1, nxt2 - nxt1);
3172 else if (nxt2 - nxt1 < U16_MAX)
3173 NEXT_OFF(nxt1) = nxt2 - nxt1;
3174 else
3175 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3176 }
8aa23a47 3177 nxt1 = nnxt;
c277df42 3178 }
5d1c421c 3179#endif
8aa23a47
YO
3180 /* Optimize again: */
3181 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3182 NULL, stopparen, recursed, NULL, 0,depth+1);
3183 }
3184 else
3185 oscan->flags = 0;
3186 }
3187 else if ((OP(oscan) == CURLYX)
3188 && (flags & SCF_WHILEM_VISITED_POS)
3189 /* See the comment on a similar expression above.
3190 However, this time it not a subexpression
3191 we care about, but the expression itself. */
3192 && (maxcount == REG_INFTY)
3193 && data && ++data->whilem_c < 16) {
3194 /* This stays as CURLYX, we can put the count/of pair. */
3195 /* Find WHILEM (as in regexec.c) */
3196 regnode *nxt = oscan + NEXT_OFF(oscan);
3197
3198 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3199 nxt += ARG(nxt);
3200 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3201 | (RExC_whilem_seen << 4)); /* On WHILEM */
3202 }
3203 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3204 pars++;
3205 if (flags & SCF_DO_SUBSTR) {
3206 SV *last_str = NULL;
3207 int counted = mincount != 0;
a0ed51b3 3208
8aa23a47
YO
3209 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3210#if defined(SPARC64_GCC_WORKAROUND)
3211 I32 b = 0;
3212 STRLEN l = 0;
3213 const char *s = NULL;
3214 I32 old = 0;
b515a41d 3215
8aa23a47
YO
3216 if (pos_before >= data->last_start_min)
3217 b = pos_before;
3218 else
3219 b = data->last_start_min;
b515a41d 3220
8aa23a47
YO
3221 l = 0;
3222 s = SvPV_const(data->last_found, l);
3223 old = b - data->last_start_min;
3224
3225#else
3226 I32 b = pos_before >= data->last_start_min
3227 ? pos_before : data->last_start_min;
3228 STRLEN l;
3229 const char * const s = SvPV_const(data->last_found, l);
3230 I32 old = b - data->last_start_min;
3231#endif
3232
3233 if (UTF)
3234 old = utf8_hop((U8*)s, old) - (U8*)s;
3235
3236 l -= old;
3237 /* Get the added string: */
3238 last_str = newSVpvn(s + old, l);
3239 if (UTF)
3240 SvUTF8_on(last_str);
3241 if (deltanext == 0 && pos_before == b) {
3242 /* What was added is a constant string */
3243 if (mincount > 1) {
3244 SvGROW(last_str, (mincount * l) + 1);
3245 repeatcpy(SvPVX(last_str) + l,
3246 SvPVX_const(last_str), l, mincount - 1);
3247 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3248 /* Add additional parts. */
3249 SvCUR_set(data->last_found,
3250 SvCUR(data->last_found) - l);
3251 sv_catsv(data->last_found, last_str);
3252 {
3253 SV * sv = data->last_found;
3254 MAGIC *mg =
3255 SvUTF8(sv) && SvMAGICAL(sv) ?
3256 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3257 if (mg && mg->mg_len >= 0)
3258 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3259 }
8aa23a47 3260 data->last_end += l * (mincount - 1);
b515a41d 3261 }
8aa23a47
YO
3262 } else {
3263 /* start offset must point into the last copy */
3264 data->last_start_min += minnext * (mincount - 1);
3265 data->last_start_max += is_inf ? I32_MAX
3266 : (maxcount - 1) * (minnext + data->pos_delta);
3267 }
c277df42 3268 }
8aa23a47
YO
3269 /* It is counted once already... */
3270 data->pos_min += minnext * (mincount - counted);
3271 data->pos_delta += - counted * deltanext +
3272 (minnext + deltanext) * maxcount - minnext * mincount;
3273 if (mincount != maxcount) {
3274 /* Cannot extend fixed substrings found inside
3275 the group. */
304ee84b 3276 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3277 if (mincount && last_str) {
3278 SV * const sv = data->last_found;
3279 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3280 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3281
3282 if (mg)
3283 mg->mg_len = -1;
3284 sv_setsv(sv, last_str);
3285 data->last_end = data->pos_min;
3286 data->last_start_min =
3287 data->pos_min - CHR_SVLEN(last_str);
3288 data->last_start_max = is_inf
3289 ? I32_MAX
3290 : data->pos_min + data->pos_delta
3291 - CHR_SVLEN(last_str);
3292 }
3293 data->longest = &(data->longest_float);
3294 }
3295 SvREFCNT_dec(last_str);
c277df42 3296 }
8aa23a47
YO
3297 if (data && (fl & SF_HAS_EVAL))
3298 data->flags |= SF_HAS_EVAL;
3299 optimize_curly_tail:
3300 if (OP(oscan) != CURLYX) {
3301 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3302 && NEXT_OFF(next))
3303 NEXT_OFF(oscan) += NEXT_OFF(next);
3304 }
3305 continue;
3306 default: /* REF and CLUMP only? */
3307 if (flags & SCF_DO_SUBSTR) {
304ee84b 3308 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3309 data->longest = &(data->longest_float);
3310 }
3311 is_inf = is_inf_internal = 1;
3312 if (flags & SCF_DO_STCLASS_OR)
3313 cl_anything(pRExC_state, data->start_class);
3314 flags &= ~SCF_DO_STCLASS;
3315 break;
c277df42 3316 }
8aa23a47
YO
3317 }
3318 else if (strchr((const char*)PL_simple,OP(scan))) {
3319 int value = 0;
653099ff 3320
8aa23a47 3321 if (flags & SCF_DO_SUBSTR) {
304ee84b 3322 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3323 data->pos_min++;
3324 }
3325 min++;
3326 if (flags & SCF_DO_STCLASS) {
3327 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3328
8aa23a47
YO
3329 /* Some of the logic below assumes that switching
3330 locale on will only add false positives. */
3331 switch (PL_regkind[OP(scan)]) {
3332 case SANY:
3333 default:
3334 do_default:
3335 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3336 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3337 cl_anything(pRExC_state, data->start_class);
3338 break;
3339 case REG_ANY:
3340 if (OP(scan) == SANY)
3341 goto do_default;
3342 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3343 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3344 || (data->start_class->flags & ANYOF_CLASS));
3345 cl_anything(pRExC_state, data->start_class);
653099ff 3346 }
8aa23a47
YO
3347 if (flags & SCF_DO_STCLASS_AND || !value)
3348 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3349 break;
3350 case ANYOF:
3351 if (flags & SCF_DO_STCLASS_AND)
3352 cl_and(data->start_class,
3353 (struct regnode_charclass_class*)scan);
653099ff 3354 else
8aa23a47
YO
3355 cl_or(pRExC_state, data->start_class,
3356 (struct regnode_charclass_class*)scan);
3357 break;
3358 case ALNUM:
3359 if (flags & SCF_DO_STCLASS_AND) {
3360 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3361 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3362 for (value = 0; value < 256; value++)
3363 if (!isALNUM(value))
3364 ANYOF_BITMAP_CLEAR(data->start_class, value);
3365 }
653099ff 3366 }
8aa23a47
YO
3367 else {
3368 if (data->start_class->flags & ANYOF_LOCALE)
3369 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3370 else {
3371 for (value = 0; value < 256; value++)
3372 if (isALNUM(value))
3373 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3374 }
8aa23a47
YO
3375 }
3376 break;
3377 case ALNUML:
3378 if (flags & SCF_DO_STCLASS_AND) {
3379 if (data->start_class->flags & ANYOF_LOCALE)
3380 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3381 }
3382 else {
3383 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3384 data->start_class->flags |= ANYOF_LOCALE;
3385 }
3386 break;
3387 case NALNUM:
3388 if (flags & SCF_DO_STCLASS_AND) {
3389 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3390 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3391 for (value = 0; value < 256; value++)
3392 if (isALNUM(value))
3393 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3394 }
3395 }
8aa23a47
YO
3396 else {
3397 if (data->start_class->flags & ANYOF_LOCALE)
3398 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3399 else {
3400 for (value = 0; value < 256; value++)
3401 if (!isALNUM(value))
3402 ANYOF_BITMAP_SET(data->start_class, value);
3403 }
653099ff 3404 }
8aa23a47
YO
3405 break;
3406 case NALNUML:
3407 if (flags & SCF_DO_STCLASS_AND) {
3408 if (data->start_class->flags & ANYOF_LOCALE)
3409 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3410 }
8aa23a47
YO
3411 else {
3412 data->start_class->flags |= ANYOF_LOCALE;
3413 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3414 }
3415 break;
3416 case SPACE:
3417 if (flags & SCF_DO_STCLASS_AND) {
3418 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3419 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3420 for (value = 0; value < 256; value++)
3421 if (!isSPACE(value))
3422 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3423 }
3424 }
8aa23a47
YO
3425 else {
3426 if (data->start_class->flags & ANYOF_LOCALE)
3427 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3428 else {
3429 for (value = 0; value < 256; value++)
3430 if (isSPACE(value))
3431 ANYOF_BITMAP_SET(data->start_class, value);
3432 }
653099ff 3433 }
8aa23a47
YO
3434 break;
3435 case SPACEL:
3436 if (flags & SCF_DO_STCLASS_AND) {
3437 if (data->start_class->flags & ANYOF_LOCALE)
3438 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3439 }
3440 else {
3441 data->start_class->flags |= ANYOF_LOCALE;
3442 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3443 }
3444 break;
3445 case NSPACE:
3446 if (flags & SCF_DO_STCLASS_AND) {
3447 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3448 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3449 for (value = 0; value < 256; value++)
3450 if (isSPACE(value))
3451 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3452 }
8aa23a47
YO
3453 }
3454 else {
3455 if (data->start_class->flags & ANYOF_LOCALE)
3456 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3457 else {
3458 for (value = 0; value < 256; value++)
3459 if (!isSPACE(value))
3460 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3461 }
3462 }
8aa23a47
YO
3463 break;
3464 case NSPACEL:
3465 if (flags & SCF_DO_STCLASS_AND) {
3466 if (data->start_class->flags & ANYOF_LOCALE) {
3467 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3468 for (value = 0; value < 256; value++)
3469 if (!isSPACE(value))
3470 ANYOF_BITMAP_CLEAR(data->start_class, value);
3471 }
653099ff 3472 }
8aa23a47
YO
3473 else {
3474 data->start_class->flags |= ANYOF_LOCALE;
3475 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3476 }
3477 break;
3478 case DIGIT:
3479 if (flags & SCF_DO_STCLASS_AND) {
3480 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3481 for (value = 0; value < 256; value++)
3482 if (!isDIGIT(value))
3483 ANYOF_BITMAP_CLEAR(data->start_class, value);
3484 }
3485 else {
3486 if (data->start_class->flags & ANYOF_LOCALE)
3487 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3488 else {
3489 for (value = 0; value < 256; value++)
3490 if (isDIGIT(value))
3491 ANYOF_BITMAP_SET(data->start_class, value);
3492 }
3493 }
3494 break;
3495 case NDIGIT:
3496 if (flags & SCF_DO_STCLASS_AND) {
3497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3498 for (value = 0; value < 256; value++)
3499 if (isDIGIT(value))
3500 ANYOF_BITMAP_CLEAR(data->start_class, value);
3501 }
3502 else {
3503 if (data->start_class->flags & ANYOF_LOCALE)
3504 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3505 else {
3506 for (value = 0; value < 256; value++)
3507 if (!isDIGIT(value))
3508 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3509 }
3510 }
8aa23a47
YO
3511 break;
3512 }
3513 if (flags & SCF_DO_STCLASS_OR)
3514 cl_and(data->start_class, and_withp);
3515 flags &= ~SCF_DO_STCLASS;
3516 }
3517 }
3518 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3519 data->flags |= (OP(scan) == MEOL
3520 ? SF_BEFORE_MEOL
3521 : SF_BEFORE_SEOL);
3522 }
3523 else if ( PL_regkind[OP(scan)] == BRANCHJ
3524 /* Lookbehind, or need to calculate parens/evals/stclass: */
3525 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3526 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3527 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3528 || OP(scan) == UNLESSM )
3529 {
3530 /* Negative Lookahead/lookbehind
3531 In this case we can't do fixed string optimisation.
3532 */
1de06328 3533
8aa23a47
YO
3534 I32 deltanext, minnext, fake = 0;
3535 regnode *nscan;
3536 struct regnode_charclass_class intrnl;
3537 int f = 0;
1de06328 3538
8aa23a47
YO
3539 data_fake.flags = 0;
3540 if (data) {
3541 data_fake.whilem_c = data->whilem_c;
3542 data_fake.last_closep = data->last_closep;
c277df42 3543 }
8aa23a47
YO
3544 else
3545 data_fake.last_closep = &fake;
58e23c8d 3546 data_fake.pos_delta = delta;
8aa23a47
YO
3547 if ( flags & SCF_DO_STCLASS && !scan->flags
3548 && OP(scan) == IFMATCH ) { /* Lookahead */
3549 cl_init(pRExC_state, &intrnl);
3550 data_fake.start_class = &intrnl;
3551 f |= SCF_DO_STCLASS_AND;
3552 }
3553 if (flags & SCF_WHILEM_VISITED_POS)
3554 f |= SCF_WHILEM_VISITED_POS;
3555 next = regnext(scan);
3556 nscan = NEXTOPER(NEXTOPER(scan));
3557 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3558 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3559 if (scan->flags) {
3560 if (deltanext) {
58e23c8d 3561 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3562 }
3563 else if (minnext > (I32)U8_MAX) {
58e23c8d 3564 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3565 }
3566 scan->flags = (U8)minnext;
3567 }
3568 if (data) {
3569 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3570 pars++;
3571 if (data_fake.flags & SF_HAS_EVAL)
3572 data->flags |= SF_HAS_EVAL;
3573 data->whilem_c = data_fake.whilem_c;
3574 }
3575 if (f & SCF_DO_STCLASS_AND) {
3576 const int was = (data->start_class->flags & ANYOF_EOS);
3577
3578 cl_and(data->start_class, &intrnl);
3579 if (was)
3580 data->start_class->flags |= ANYOF_EOS;
3581 }
cb434fcc 3582 }
8aa23a47
YO
3583#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3584 else {
3585 /* Positive Lookahead/lookbehind
3586 In this case we can do fixed string optimisation,
3587 but we must be careful about it. Note in the case of
3588 lookbehind the positions will be offset by the minimum
3589 length of the pattern, something we won't know about
3590 until after the recurse.
3591 */
3592 I32 deltanext, fake = 0;
3593 regnode *nscan;
3594 struct regnode_charclass_class intrnl;
3595 int f = 0;
3596 /* We use SAVEFREEPV so that when the full compile
3597 is finished perl will clean up the allocated
3598 minlens when its all done. This was we don't
3599 have to worry about freeing them when we know
3600 they wont be used, which would be a pain.
3601 */
3602 I32 *minnextp;
3603 Newx( minnextp, 1, I32 );
3604 SAVEFREEPV(minnextp);
3605
3606 if (data) {
3607 StructCopy(data, &data_fake, scan_data_t);
3608 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3609 f |= SCF_DO_SUBSTR;
3610 if (scan->flags)
304ee84b 3611 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3612 data_fake.last_found=newSVsv(data->last_found);
3613 }
3614 }
3615 else
3616 data_fake.last_closep = &fake;
3617 data_fake.flags = 0;
58e23c8d 3618 data_fake.pos_delta = delta;
8aa23a47
YO
3619 if (is_inf)
3620 data_fake.flags |= SF_IS_INF;
3621 if ( flags & SCF_DO_STCLASS && !scan->flags
3622 && OP(scan) == IFMATCH ) { /* Lookahead */
3623 cl_init(pRExC_state, &intrnl);
3624 data_fake.start_class = &intrnl;
3625 f |= SCF_DO_STCLASS_AND;
3626 }
3627 if (flags & SCF_WHILEM_VISITED_POS)
3628 f |= SCF_WHILEM_VISITED_POS;
3629 next = regnext(scan);
3630 nscan = NEXTOPER(NEXTOPER(scan));
3631
3632 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3633 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3634 if (scan->flags) {
3635 if (deltanext) {
58e23c8d 3636 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3637 }
3638 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3639 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3640 }
3641 scan->flags = (U8)*minnextp;
3642 }
3643
3644 *minnextp += min;
3645
3646 if (f & SCF_DO_STCLASS_AND) {
3647 const int was = (data->start_class->flags & ANYOF_EOS);
3648
3649 cl_and(data->start_class, &intrnl);
3650 if (was)
3651 data->start_class->flags |= ANYOF_EOS;
3652 }
3653 if (data) {
3654 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3655 pars++;
3656 if (data_fake.flags & SF_HAS_EVAL)
3657 data->flags |= SF_HAS_EVAL;
3658 data->whilem_c = data_fake.whilem_c;
3659 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3660 if (RExC_rx->minlen<*minnextp)
3661 RExC_rx->minlen=*minnextp;
304ee84b 3662 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3663 SvREFCNT_dec(data_fake.last_found);
3664
3665 if ( data_fake.minlen_fixed != minlenp )
3666 {
3667 data->offset_fixed= data_fake.offset_fixed;
3668 data->minlen_fixed= data_fake.minlen_fixed;
3669 data->lookbehind_fixed+= scan->flags;
3670 }
3671 if ( data_fake.minlen_float != minlenp )
3672 {
3673 data->minlen_float= data_fake.minlen_float;
3674 data->offset_float_min=data_fake.offset_float_min;
3675 data->offset_float_max=data_fake.offset_float_max;
3676 data->lookbehind_float+= scan->flags;
3677 }
3678 }
3679 }
3680
3681
40d049e4 3682 }
8aa23a47
YO
3683#endif
3684 }
3685 else if (OP(scan) == OPEN) {
3686 if (stopparen != (I32)ARG(scan))
3687 pars++;
3688 }
3689 else if (OP(scan) == CLOSE) {
3690 if (stopparen == (I32)ARG(scan)) {
3691 break;
3692 }
3693 if ((I32)ARG(scan) == is_par) {
3694 next = regnext(scan);
b515a41d 3695
8aa23a47
YO
3696 if ( next && (OP(next) != WHILEM) && next < last)
3697 is_par = 0; /* Disable optimization */
40d049e4 3698 }
8aa23a47
YO
3699 if (data)
3700 *(data->last_closep) = ARG(scan);
3701 }
3702 else if (OP(scan) == EVAL) {
c277df42
IZ
3703 if (data)
3704 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3705 }
3706 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3707 if (flags & SCF_DO_SUBSTR) {
304ee84b 3708 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3709 flags &= ~SCF_DO_SUBSTR;
40d049e4 3710 }
8aa23a47
YO
3711 if (data && OP(scan)==ACCEPT) {
3712 data->flags |= SCF_SEEN_ACCEPT;
3713 if (stopmin > min)
3714 stopmin = min;
e2e6a0f1 3715 }
8aa23a47
YO
3716 }
3717 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3718 {
0f5d15d6 3719 if (flags & SCF_DO_SUBSTR) {
304ee84b 3720 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3721 data->longest = &(data->longest_float);
3722 }
3723 is_inf = is_inf_internal = 1;
653099ff 3724 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3725 cl_anything(pRExC_state, data->start_class);
96776eda 3726 flags &= ~SCF_DO_STCLASS;
8aa23a47 3727 }
58e23c8d 3728 else if (OP(scan) == GPOS) {
bbe252da 3729 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3730 !(delta || is_inf || (data && data->pos_delta)))
3731 {
bbe252da
YO
3732 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3733 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3734 if (RExC_rx->gofs < (U32)min)
3735 RExC_rx->gofs = min;
3736 } else {
bbe252da 3737 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3738 RExC_rx->gofs = 0;
3739 }
3740 }
786e8c11 3741#ifdef TRIE_STUDY_OPT
40d049e4 3742#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3743 else if (PL_regkind[OP(scan)] == TRIE) {
3744 /* NOTE - There is similar code to this block above for handling
3745 BRANCH nodes on the initial study. If you change stuff here
3746 check there too. */
3747 regnode *trie_node= scan;
3748 regnode *tail= regnext(scan);
f8fc2ecf 3749 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3750 I32 max1 = 0, min1 = I32_MAX;
3751 struct regnode_charclass_class accum;
3752
3753 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3754 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3755 if (flags & SCF_DO_STCLASS)
3756 cl_init_zero(pRExC_state, &accum);
3757
3758 if (!trie->jump) {
3759 min1= trie->minlen;
3760 max1= trie->maxlen;
3761 } else {
3762 const regnode *nextbranch= NULL;
3763 U32 word;
3764
3765 for ( word=1 ; word <= trie->wordcount ; word++)
3766 {
3767 I32 deltanext=0, minnext=0, f = 0, fake;
3768 struct regnode_charclass_class this_class;
3769
3770 data_fake.flags = 0;
3771 if (data) {
3772 data_fake.whilem_c = data->whilem_c;
3773 data_fake.last_closep = data->last_closep;
3774 }
3775 else
3776 data_fake.last_closep = &fake;
58e23c8d 3777 data_fake.pos_delta = delta;
8aa23a47
YO
3778 if (flags & SCF_DO_STCLASS) {
3779 cl_init(pRExC_state, &this_class);
3780 data_fake.start_class = &this_class;
3781 f = SCF_DO_STCLASS_AND;
3782 }
3783 if (flags & SCF_WHILEM_VISITED_POS)
3784 f |= SCF_WHILEM_VISITED_POS;
3785
3786 if (trie->jump[word]) {
3787 if (!nextbranch)
3788 nextbranch = trie_node + trie->jump[0];
3789 scan= trie_node + trie->jump[word];
3790 /* We go from the jump point to the branch that follows
3791 it. Note this means we need the vestigal unused branches
3792 even though they arent otherwise used.
3793 */
3794 minnext = study_chunk(pRExC_state, &scan, minlenp,
3795 &deltanext, (regnode *)nextbranch, &data_fake,
3796 stopparen, recursed, NULL, f,depth+1);
3797 }
3798 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3799 nextbranch= regnext((regnode*)nextbranch);
3800
3801 if (min1 > (I32)(minnext + trie->minlen))
3802 min1 = minnext + trie->minlen;
3803 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3804 max1 = minnext + deltanext + trie->maxlen;
3805 if (deltanext == I32_MAX)
3806 is_inf = is_inf_internal = 1;
3807
3808 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3809 pars++;
3810 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3811 if ( stopmin > min + min1)
3812 stopmin = min