This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix a couple of casting warnings
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
6bda09f9 120 regnode **parens; /* offsets of each paren */
1aa99e6b 121 I32 utf8;
6bda09f9 122 HV *charnames; /* cache of named sequences */
830247a4
IZ
123#if ADD_TO_REGEXEC
124 char *starttry; /* -Dr: where regtry was called. */
125#define RExC_starttry (pRExC_state->starttry)
126#endif
3dab1dad 127#ifdef DEBUGGING
be8e71aa 128 const char *lastparse;
3dab1dad
YO
129 I32 lastnum;
130#define RExC_lastparse (pRExC_state->lastparse)
131#define RExC_lastnum (pRExC_state->lastnum)
132#endif
830247a4
IZ
133} RExC_state_t;
134
e2509266 135#define RExC_flags (pRExC_state->flags)
830247a4
IZ
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 154#define RExC_charnames (pRExC_state->charnames)
6bda09f9 155#define RExC_parens (pRExC_state->parens)
830247a4 156
a687059c
LW
157#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
158#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
159 ((*s) == '{' && regcurly(s)))
a687059c 160
35c8bce7
LW
161#ifdef SPSTART
162#undef SPSTART /* dratted cpp namespace... */
163#endif
a687059c
LW
164/*
165 * Flags to be passed up and down.
166 */
a687059c 167#define WORST 0 /* Worst case. */
821b33a5 168#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
169#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
170#define SPSTART 0x4 /* Starts with * or +. */
171#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 172
3dab1dad
YO
173#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
174
07be1b83
YO
175/* whether trie related optimizations are enabled */
176#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
177#define TRIE_STUDY_OPT
786e8c11 178#define FULL_TRIE_STUDY
07be1b83
YO
179#define TRIE_STCLASS
180#endif
1de06328
YO
181
182
183/* About scan_data_t.
184
185 During optimisation we recurse through the regexp program performing
186 various inplace (keyhole style) optimisations. In addition study_chunk
187 and scan_commit populate this data structure with information about
188 what strings MUST appear in the pattern. We look for the longest
189 string that must appear for at a fixed location, and we look for the
190 longest string that may appear at a floating location. So for instance
191 in the pattern:
192
193 /FOO[xX]A.*B[xX]BAR/
194
195 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
196 strings (because they follow a .* construct). study_chunk will identify
197 both FOO and BAR as being the longest fixed and floating strings respectively.
198
199 The strings can be composites, for instance
200
201 /(f)(o)(o)/
202
203 will result in a composite fixed substring 'foo'.
204
205 For each string some basic information is maintained:
206
207 - offset or min_offset
208 This is the position the string must appear at, or not before.
209 It also implicitly (when combined with minlenp) tells us how many
210 character must match before the string we are searching.
211 Likewise when combined with minlenp and the length of the string
212 tells us how many characters must appear after the string we have
213 found.
214
215 - max_offset
216 Only used for floating strings. This is the rightmost point that
217 the string can appear at. Ifset to I32 max it indicates that the
218 string can occur infinitely far to the right.
219
220 - minlenp
221 A pointer to the minimum length of the pattern that the string
222 was found inside. This is important as in the case of positive
223 lookahead or positive lookbehind we can have multiple patterns
224 involved. Consider
225
226 /(?=FOO).*F/
227
228 The minimum length of the pattern overall is 3, the minimum length
229 of the lookahead part is 3, but the minimum length of the part that
230 will actually match is 1. So 'FOO's minimum length is 3, but the
231 minimum length for the F is 1. This is important as the minimum length
232 is used to determine offsets in front of and behind the string being
233 looked for. Since strings can be composites this is the length of the
234 pattern at the time it was commited with a scan_commit. Note that
235 the length is calculated by study_chunk, so that the minimum lengths
236 are not known until the full pattern has been compiled, thus the
237 pointer to the value.
238
239 - lookbehind
240
241 In the case of lookbehind the string being searched for can be
242 offset past the start point of the final matching string.
243 If this value was just blithely removed from the min_offset it would
244 invalidate some of the calculations for how many chars must match
245 before or after (as they are derived from min_offset and minlen and
246 the length of the string being searched for).
247 When the final pattern is compiled and the data is moved from the
248 scan_data_t structure into the regexp structure the information
249 about lookbehind is factored in, with the information that would
250 have been lost precalculated in the end_shift field for the
251 associated string.
252
253 The fields pos_min and pos_delta are used to store the minimum offset
254 and the delta to the maximum offset at the current point in the pattern.
255
256*/
2c2d71f5
JH
257
258typedef struct scan_data_t {
1de06328
YO
259 /*I32 len_min; unused */
260 /*I32 len_delta; unused */
2c2d71f5
JH
261 I32 pos_min;
262 I32 pos_delta;
263 SV *last_found;
1de06328 264 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
265 I32 last_start_min;
266 I32 last_start_max;
1de06328
YO
267 SV **longest; /* Either &l_fixed, or &l_float. */
268 SV *longest_fixed; /* longest fixed string found in pattern */
269 I32 offset_fixed; /* offset where it starts */
270 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
271 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
272 SV *longest_float; /* longest floating string found in pattern */
273 I32 offset_float_min; /* earliest point in string it can appear */
274 I32 offset_float_max; /* latest point in string it can appear */
275 I32 *minlen_float; /* pointer to the minlen relevent to the string */
276 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
277 I32 flags;
278 I32 whilem_c;
cb434fcc 279 I32 *last_closep;
653099ff 280 struct regnode_charclass_class *start_class;
2c2d71f5
JH
281} scan_data_t;
282
a687059c 283/*
e50aee73 284 * Forward declarations for pregcomp()'s friends.
a687059c 285 */
a0d0e21e 286
27da23d5 287static const scan_data_t zero_scan_data =
1de06328 288 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
289
290#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
291#define SF_BEFORE_SEOL 0x0001
292#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
293#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
294#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
295
09b7f37c
CB
296#ifdef NO_UNARY_PLUS
297# define SF_FIX_SHIFT_EOL (0+2)
298# define SF_FL_SHIFT_EOL (0+4)
299#else
300# define SF_FIX_SHIFT_EOL (+2)
301# define SF_FL_SHIFT_EOL (+4)
302#endif
c277df42
IZ
303
304#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
305#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
306
307#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
308#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
309#define SF_IS_INF 0x0040
310#define SF_HAS_PAR 0x0080
311#define SF_IN_PAR 0x0100
312#define SF_HAS_EVAL 0x0200
313#define SCF_DO_SUBSTR 0x0400
653099ff
GS
314#define SCF_DO_STCLASS_AND 0x0800
315#define SCF_DO_STCLASS_OR 0x1000
316#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 317#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 318
786e8c11
YO
319#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
320
07be1b83 321
eb160463 322#define UTF (RExC_utf8 != 0)
e2509266
JH
323#define LOC ((RExC_flags & PMf_LOCALE) != 0)
324#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 325
ffc61ed2 326#define OOB_UNICODE 12345678
93733859 327#define OOB_NAMEDCLASS -1
b8c5462f 328
a0ed51b3
LW
329#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
330#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
331
8615cb43 332
b45f050a
JF
333/* length of regex to show in messages that don't mark a position within */
334#define RegexLengthToShowInErrorMessages 127
335
336/*
337 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
338 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
339 * op/pragma/warn/regcomp.
340 */
7253e4e3
RK
341#define MARKER1 "<-- HERE" /* marker as it appears in the description */
342#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 343
7253e4e3 344#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
345
346/*
347 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
348 * arg. Show regex, up to a maximum length. If it's too long, chop and add
349 * "...".
350 */
ccb2c380 351#define FAIL(msg) STMT_START { \
bfed75c6 352 const char *ellipses = ""; \
ccb2c380
MP
353 IV len = RExC_end - RExC_precomp; \
354 \
355 if (!SIZE_ONLY) \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 if (len > RegexLengthToShowInErrorMessages) { \
358 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
359 len = RegexLengthToShowInErrorMessages - 10; \
360 ellipses = "..."; \
361 } \
362 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
363 msg, (int)len, RExC_precomp, ellipses); \
364} STMT_END
8615cb43 365
b45f050a 366/*
b45f050a
JF
367 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
368 */
ccb2c380 369#define Simple_vFAIL(m) STMT_START { \
a28509cc 370 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
371 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
b45f050a
JF
374
375/*
376 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
377 */
ccb2c380
MP
378#define vFAIL(m) STMT_START { \
379 if (!SIZE_ONLY) \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 Simple_vFAIL(m); \
382} STMT_END
b45f050a
JF
383
384/*
385 * Like Simple_vFAIL(), but accepts two arguments.
386 */
ccb2c380 387#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 388 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
389 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
390 (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
b45f050a
JF
392
393/*
394 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
395 */
ccb2c380
MP
396#define vFAIL2(m,a1) STMT_START { \
397 if (!SIZE_ONLY) \
398 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
399 Simple_vFAIL2(m, a1); \
400} STMT_END
b45f050a
JF
401
402
403/*
404 * Like Simple_vFAIL(), but accepts three arguments.
405 */
ccb2c380 406#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 407 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
408 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
409 (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
b45f050a
JF
411
412/*
413 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
414 */
ccb2c380
MP
415#define vFAIL3(m,a1,a2) STMT_START { \
416 if (!SIZE_ONLY) \
417 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
418 Simple_vFAIL3(m, a1, a2); \
419} STMT_END
b45f050a
JF
420
421/*
422 * Like Simple_vFAIL(), but accepts four arguments.
423 */
ccb2c380 424#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 425 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
426 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
427 (int)offset, RExC_precomp, RExC_precomp + offset); \
428} STMT_END
b45f050a 429
ccb2c380 430#define vWARN(loc,m) STMT_START { \
a28509cc 431 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
432 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
433 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
434} STMT_END
435
436#define vWARNdep(loc,m) STMT_START { \
a28509cc 437 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
438 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
439 "%s" REPORT_LOCATION, \
440 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
441} STMT_END
442
443
444#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 445 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
446 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
447 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
449
450#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 451 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
452 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
453 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
454} STMT_END
455
456#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 457 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
458 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
459 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
460} STMT_END
461
462#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 463 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
464 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
465 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
466} STMT_END
9d1d55b5 467
8615cb43 468
cd439c50 469/* Allow for side effects in s */
ccb2c380
MP
470#define REGC(c,s) STMT_START { \
471 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
472} STMT_END
cd439c50 473
fac92740
MJD
474/* Macros for recording node offsets. 20001227 mjd@plover.com
475 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
476 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
477 * Element 0 holds the number n.
07be1b83 478 * Position is 1 indexed.
fac92740
MJD
479 */
480
ccb2c380
MP
481#define Set_Node_Offset_To_R(node,byte) STMT_START { \
482 if (! SIZE_ONLY) { \
483 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 484 __LINE__, (node), (int)(byte))); \
ccb2c380 485 if((node) < 0) { \
551405c4 486 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
487 } else { \
488 RExC_offsets[2*(node)-1] = (byte); \
489 } \
490 } \
491} STMT_END
492
493#define Set_Node_Offset(node,byte) \
494 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
495#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
496
497#define Set_Node_Length_To_R(node,len) STMT_START { \
498 if (! SIZE_ONLY) { \
499 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 500 __LINE__, (int)(node), (int)(len))); \
ccb2c380 501 if((node) < 0) { \
551405c4 502 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
503 } else { \
504 RExC_offsets[2*(node)] = (len); \
505 } \
506 } \
507} STMT_END
508
509#define Set_Node_Length(node,len) \
510 Set_Node_Length_To_R((node)-RExC_emit_start, len)
511#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
512#define Set_Node_Cur_Length(node) \
513 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
514
515/* Get offsets and lengths */
516#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
517#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
518
07be1b83
YO
519#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
520 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
521 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
522} STMT_END
523
524
525#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
526#define EXPERIMENTAL_INPLACESCAN
527#endif
528
1de06328 529#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 530DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
531 PerlIO_printf(Perl_debug_log, \
532 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
533 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
534 (int)(depth)*2, "", \
535 (IV)((data)->pos_min), \
536 (IV)((data)->pos_delta), \
537 (IV)((data)->flags), \
538 (IV)((data)->whilem_c), \
539 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
540 ); \
541 if ((data)->last_found) \
542 PerlIO_printf(Perl_debug_log, \
543 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
544 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
545 SvPVX_const((data)->last_found), \
546 (IV)((data)->last_end), \
547 (IV)((data)->last_start_min), \
548 (IV)((data)->last_start_max), \
549 ((data)->longest && \
550 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
551 SvPVX_const((data)->longest_fixed), \
552 (IV)((data)->offset_fixed), \
553 ((data)->longest && \
554 (data)->longest==&((data)->longest_float)) ? "*" : "", \
555 SvPVX_const((data)->longest_float), \
556 (IV)((data)->offset_float_min), \
557 (IV)((data)->offset_float_max) \
558 ); \
559 PerlIO_printf(Perl_debug_log,"\n"); \
560});
561
acfe0abc 562static void clear_re(pTHX_ void *r);
4327152a 563
653099ff 564/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 565 Update the longest found anchored substring and the longest found
653099ff
GS
566 floating substrings if needed. */
567
4327152a 568STATIC void
1de06328 569S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 570{
e1ec3a88
AL
571 const STRLEN l = CHR_SVLEN(data->last_found);
572 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 573 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 574
c277df42 575 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 576 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
577 if (*data->longest == data->longest_fixed) {
578 data->offset_fixed = l ? data->last_start_min : data->pos_min;
579 if (data->flags & SF_BEFORE_EOL)
b81d288d 580 data->flags
c277df42
IZ
581 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
582 else
583 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
584 data->minlen_fixed=minlenp;
585 data->lookbehind_fixed=0;
a0ed51b3
LW
586 }
587 else {
c277df42 588 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
589 data->offset_float_max = (l
590 ? data->last_start_max
c277df42 591 : data->pos_min + data->pos_delta);
9051bda5
HS
592 if ((U32)data->offset_float_max > (U32)I32_MAX)
593 data->offset_float_max = I32_MAX;
c277df42 594 if (data->flags & SF_BEFORE_EOL)
b81d288d 595 data->flags
c277df42
IZ
596 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
597 else
598 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
599 data->minlen_float=minlenp;
600 data->lookbehind_float=0;
c277df42
IZ
601 }
602 }
603 SvCUR_set(data->last_found, 0);
0eda9292 604 {
a28509cc 605 SV * const sv = data->last_found;
097eb12c
AL
606 if (SvUTF8(sv) && SvMAGICAL(sv)) {
607 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
608 if (mg)
609 mg->mg_len = 0;
610 }
0eda9292 611 }
c277df42
IZ
612 data->last_end = -1;
613 data->flags &= ~SF_BEFORE_EOL;
1de06328 614 DEBUG_STUDYDATA(data,0);
c277df42
IZ
615}
616
653099ff
GS
617/* Can match anything (initialization) */
618STATIC void
097eb12c 619S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 620{
653099ff 621 ANYOF_CLASS_ZERO(cl);
f8bef550 622 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 623 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
624 if (LOC)
625 cl->flags |= ANYOF_LOCALE;
626}
627
628/* Can match anything (initialization) */
629STATIC int
5f66b61c 630S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
631{
632 int value;
633
aaa51d5e 634 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
635 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
636 return 1;
1aa99e6b
IH
637 if (!(cl->flags & ANYOF_UNICODE_ALL))
638 return 0;
10edeb5d 639 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 640 return 0;
653099ff
GS
641 return 1;
642}
643
644/* Can match anything (initialization) */
645STATIC void
097eb12c 646S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 647{
8ecf7187 648 Zero(cl, 1, struct regnode_charclass_class);
653099ff 649 cl->type = ANYOF;
830247a4 650 cl_anything(pRExC_state, cl);
653099ff
GS
651}
652
653STATIC void
097eb12c 654S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 655{
8ecf7187 656 Zero(cl, 1, struct regnode_charclass_class);
653099ff 657 cl->type = ANYOF;
830247a4 658 cl_anything(pRExC_state, cl);
653099ff
GS
659 if (LOC)
660 cl->flags |= ANYOF_LOCALE;
661}
662
663/* 'And' a given class with another one. Can create false positives */
664/* We assume that cl is not inverted */
665STATIC void
5f66b61c 666S_cl_and(struct regnode_charclass_class *cl,
a28509cc 667 const struct regnode_charclass_class *and_with)
653099ff 668{
653099ff
GS
669 if (!(and_with->flags & ANYOF_CLASS)
670 && !(cl->flags & ANYOF_CLASS)
671 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
672 && !(and_with->flags & ANYOF_FOLD)
673 && !(cl->flags & ANYOF_FOLD)) {
674 int i;
675
676 if (and_with->flags & ANYOF_INVERT)
677 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
678 cl->bitmap[i] &= ~and_with->bitmap[i];
679 else
680 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
681 cl->bitmap[i] &= and_with->bitmap[i];
682 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
683 if (!(and_with->flags & ANYOF_EOS))
684 cl->flags &= ~ANYOF_EOS;
1aa99e6b 685
14ebb1a2
JH
686 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
687 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
688 cl->flags &= ~ANYOF_UNICODE_ALL;
689 cl->flags |= ANYOF_UNICODE;
690 ARG_SET(cl, ARG(and_with));
691 }
14ebb1a2
JH
692 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
693 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 694 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
695 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
696 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 697 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
698}
699
700/* 'OR' a given class with another one. Can create false positives */
701/* We assume that cl is not inverted */
702STATIC void
097eb12c 703S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 704{
653099ff
GS
705 if (or_with->flags & ANYOF_INVERT) {
706 /* We do not use
707 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
708 * <= (B1 | !B2) | (CL1 | !CL2)
709 * which is wasteful if CL2 is small, but we ignore CL2:
710 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
711 * XXXX Can we handle case-fold? Unclear:
712 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
713 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
714 */
715 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
716 && !(or_with->flags & ANYOF_FOLD)
717 && !(cl->flags & ANYOF_FOLD) ) {
718 int i;
719
720 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
721 cl->bitmap[i] |= ~or_with->bitmap[i];
722 } /* XXXX: logic is complicated otherwise */
723 else {
830247a4 724 cl_anything(pRExC_state, cl);
653099ff
GS
725 }
726 } else {
727 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
728 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 729 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
730 || (cl->flags & ANYOF_FOLD)) ) {
731 int i;
732
733 /* OR char bitmap and class bitmap separately */
734 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
735 cl->bitmap[i] |= or_with->bitmap[i];
736 if (or_with->flags & ANYOF_CLASS) {
737 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
738 cl->classflags[i] |= or_with->classflags[i];
739 cl->flags |= ANYOF_CLASS;
740 }
741 }
742 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 743 cl_anything(pRExC_state, cl);
653099ff
GS
744 }
745 }
746 if (or_with->flags & ANYOF_EOS)
747 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
748
749 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
750 ARG(cl) != ARG(or_with)) {
751 cl->flags |= ANYOF_UNICODE_ALL;
752 cl->flags &= ~ANYOF_UNICODE;
753 }
754 if (or_with->flags & ANYOF_UNICODE_ALL) {
755 cl->flags |= ANYOF_UNICODE_ALL;
756 cl->flags &= ~ANYOF_UNICODE;
757 }
653099ff
GS
758}
759
a3621e74
YO
760#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
761#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
762#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
763#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
764
3dab1dad
YO
765
766#ifdef DEBUGGING
07be1b83 767/*
3dab1dad
YO
768 dump_trie(trie)
769 dump_trie_interim_list(trie,next_alloc)
770 dump_trie_interim_table(trie,next_alloc)
771
772 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
773 The _interim_ variants are used for debugging the interim
774 tables that are used to generate the final compressed
775 representation which is what dump_trie expects.
776
3dab1dad
YO
777 Part of the reason for their existance is to provide a form
778 of documentation as to how the different representations function.
07be1b83
YO
779
780*/
3dab1dad
YO
781
782/*
783 dump_trie(trie)
784 Dumps the final compressed table form of the trie to Perl_debug_log.
785 Used for debugging make_trie().
786*/
787
788STATIC void
789S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
790{
791 U32 state;
ab3bbdeb
YO
792 SV *sv=sv_newmortal();
793 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
794 GET_RE_DEBUG_FLAGS_DECL;
795
ab3bbdeb 796
3dab1dad
YO
797 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
798 (int)depth * 2 + 2,"",
799 "Match","Base","Ofs" );
800
801 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 802 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 803 if ( tmp ) {
ab3bbdeb
YO
804 PerlIO_printf( Perl_debug_log, "%*s",
805 colwidth,
ddc5bc0f 806 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
807 PL_colors[0], PL_colors[1],
808 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
809 PERL_PV_ESCAPE_FIRSTCHAR
810 )
811 );
3dab1dad
YO
812 }
813 }
814 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
815 (int)depth * 2 + 2,"");
816
817 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 818 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
819 PerlIO_printf( Perl_debug_log, "\n");
820
786e8c11 821 for( state = 1 ; state < trie->laststate ; state++ ) {
be8e71aa 822 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
823
824 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
825
826 if ( trie->states[ state ].wordnum ) {
827 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
828 } else {
829 PerlIO_printf( Perl_debug_log, "%6s", "" );
830 }
831
832 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
833
834 if ( base ) {
835 U32 ofs = 0;
836
837 while( ( base + ofs < trie->uniquecharcount ) ||
838 ( base + ofs - trie->uniquecharcount < trie->lasttrans
839 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
840 ofs++;
841
842 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
843
844 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
845 if ( ( base + ofs >= trie->uniquecharcount ) &&
846 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
847 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
848 {
ab3bbdeb
YO
849 PerlIO_printf( Perl_debug_log, "%*"UVXf,
850 colwidth,
3dab1dad
YO
851 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
852 } else {
ab3bbdeb 853 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
854 }
855 }
856
857 PerlIO_printf( Perl_debug_log, "]");
858
859 }
860 PerlIO_printf( Perl_debug_log, "\n" );
861 }
862}
863/*
864 dump_trie_interim_list(trie,next_alloc)
865 Dumps a fully constructed but uncompressed trie in list form.
866 List tries normally only are used for construction when the number of
867 possible chars (trie->uniquecharcount) is very high.
868 Used for debugging make_trie().
869*/
870STATIC void
871S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
872{
873 U32 state;
ab3bbdeb
YO
874 SV *sv=sv_newmortal();
875 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
876 GET_RE_DEBUG_FLAGS_DECL;
877 /* print out the table precompression. */
ab3bbdeb
YO
878 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
879 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
880 "------:-----+-----------------\n" );
3dab1dad
YO
881
882 for( state=1 ; state < next_alloc ; state ++ ) {
883 U16 charid;
884
ab3bbdeb 885 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
886 (int)depth * 2 + 2,"", (UV)state );
887 if ( ! trie->states[ state ].wordnum ) {
888 PerlIO_printf( Perl_debug_log, "%5s| ","");
889 } else {
890 PerlIO_printf( Perl_debug_log, "W%4x| ",
891 trie->states[ state ].wordnum
892 );
893 }
894 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 895 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
896 if ( tmp ) {
897 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
898 colwidth,
ddc5bc0f 899 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
900 PL_colors[0], PL_colors[1],
901 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
902 PERL_PV_ESCAPE_FIRSTCHAR
903 ) ,
3dab1dad
YO
904 TRIE_LIST_ITEM(state,charid).forid,
905 (UV)TRIE_LIST_ITEM(state,charid).newstate
906 );
907 }
ab3bbdeb
YO
908 }
909 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
910 }
911}
912
913/*
914 dump_trie_interim_table(trie,next_alloc)
915 Dumps a fully constructed but uncompressed trie in table form.
916 This is the normal DFA style state transition table, with a few
917 twists to facilitate compression later.
918 Used for debugging make_trie().
919*/
920STATIC void
921S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
922{
923 U32 state;
924 U16 charid;
ab3bbdeb
YO
925 SV *sv=sv_newmortal();
926 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
927 GET_RE_DEBUG_FLAGS_DECL;
928
929 /*
930 print out the table precompression so that we can do a visual check
931 that they are identical.
932 */
933
934 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
935
936 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 937 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 938 if ( tmp ) {
ab3bbdeb
YO
939 PerlIO_printf( Perl_debug_log, "%*s",
940 colwidth,
ddc5bc0f 941 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
942 PL_colors[0], PL_colors[1],
943 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
944 PERL_PV_ESCAPE_FIRSTCHAR
945 )
946 );
3dab1dad
YO
947 }
948 }
949
950 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
951
952 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 953 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
954 }
955
956 PerlIO_printf( Perl_debug_log, "\n" );
957
958 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
959
960 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
961 (int)depth * 2 + 2,"",
962 (UV)TRIE_NODENUM( state ) );
963
964 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
965 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
966 if (v)
967 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
968 else
969 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
970 }
971 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
972 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
973 } else {
974 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
975 trie->states[ TRIE_NODENUM( state ) ].wordnum );
976 }
977 }
07be1b83 978}
3dab1dad
YO
979
980#endif
981
786e8c11
YO
982/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
983 startbranch: the first branch in the whole branch sequence
984 first : start branch of sequence of branch-exact nodes.
985 May be the same as startbranch
986 last : Thing following the last branch.
987 May be the same as tail.
988 tail : item following the branch sequence
989 count : words in the sequence
990 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
991 depth : indent depth
3dab1dad 992
786e8c11 993Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 994
786e8c11
YO
995A trie is an N'ary tree where the branches are determined by digital
996decomposition of the key. IE, at the root node you look up the 1st character and
997follow that branch repeat until you find the end of the branches. Nodes can be
998marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 999
786e8c11 1000 /he|she|his|hers/
72f13be8 1001
786e8c11
YO
1002would convert into the following structure. Numbers represent states, letters
1003following numbers represent valid transitions on the letter from that state, if
1004the number is in square brackets it represents an accepting state, otherwise it
1005will be in parenthesis.
07be1b83 1006
786e8c11
YO
1007 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1008 | |
1009 | (2)
1010 | |
1011 (1) +-i->(6)-+-s->[7]
1012 |
1013 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1014
786e8c11
YO
1015 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1016
1017This shows that when matching against the string 'hers' we will begin at state 1
1018read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1019then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1020is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1021single traverse. We store a mapping from accepting to state to which word was
1022matched, and then when we have multiple possibilities we try to complete the
1023rest of the regex in the order in which they occured in the alternation.
1024
1025The only prior NFA like behaviour that would be changed by the TRIE support is
1026the silent ignoring of duplicate alternations which are of the form:
1027
1028 / (DUPE|DUPE) X? (?{ ... }) Y /x
1029
1030Thus EVAL blocks follwing a trie may be called a different number of times with
1031and without the optimisation. With the optimisations dupes will be silently
1032ignored. This inconsistant behaviour of EVAL type nodes is well established as
1033the following demonstrates:
1034
1035 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1036
1037which prints out 'word' three times, but
1038
1039 'words'=~/(word|word|word)(?{ print $1 })S/
1040
1041which doesnt print it out at all. This is due to other optimisations kicking in.
1042
1043Example of what happens on a structural level:
1044
1045The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1046
1047 1: CURLYM[1] {1,32767}(18)
1048 5: BRANCH(8)
1049 6: EXACT <ac>(16)
1050 8: BRANCH(11)
1051 9: EXACT <ad>(16)
1052 11: BRANCH(14)
1053 12: EXACT <ab>(16)
1054 16: SUCCEED(0)
1055 17: NOTHING(18)
1056 18: END(0)
1057
1058This would be optimizable with startbranch=5, first=5, last=16, tail=16
1059and should turn into:
1060
1061 1: CURLYM[1] {1,32767}(18)
1062 5: TRIE(16)
1063 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1064 <ac>
1065 <ad>
1066 <ab>
1067 16: SUCCEED(0)
1068 17: NOTHING(18)
1069 18: END(0)
1070
1071Cases where tail != last would be like /(?foo|bar)baz/:
1072
1073 1: BRANCH(4)
1074 2: EXACT <foo>(8)
1075 4: BRANCH(7)
1076 5: EXACT <bar>(8)
1077 7: TAIL(8)
1078 8: EXACT <baz>(10)
1079 10: END(0)
1080
1081which would be optimizable with startbranch=1, first=1, last=7, tail=8
1082and would end up looking like:
1083
1084 1: TRIE(8)
1085 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1086 <foo>
1087 <bar>
1088 7: TAIL(8)
1089 8: EXACT <baz>(10)
1090 10: END(0)
1091
1092 d = uvuni_to_utf8_flags(d, uv, 0);
1093
1094is the recommended Unicode-aware way of saying
1095
1096 *(d++) = uv;
1097*/
1098
1099#define TRIE_STORE_REVCHAR \
1100 STMT_START { \
1101 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
1102 if (UTF) SvUTF8_on(tmp); \
1103 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1104 } STMT_END
1105
1106#define TRIE_READ_CHAR STMT_START { \
1107 wordlen++; \
1108 if ( UTF ) { \
1109 if ( folder ) { \
1110 if ( foldlen > 0 ) { \
1111 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1112 foldlen -= len; \
1113 scan += len; \
1114 len = 0; \
1115 } else { \
1116 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1117 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1118 foldlen -= UNISKIP( uvc ); \
1119 scan = foldbuf + UNISKIP( uvc ); \
1120 } \
1121 } else { \
1122 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1123 } \
1124 } else { \
1125 uvc = (U32)*uc; \
1126 len = 1; \
1127 } \
1128} STMT_END
1129
1130
1131
1132#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1133 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1134 TRIE_LIST_LEN( state ) *= 2; \
1135 Renew( trie->states[ state ].trans.list, \
1136 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1137 } \
1138 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1139 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1140 TRIE_LIST_CUR( state )++; \
1141} STMT_END
07be1b83 1142
786e8c11
YO
1143#define TRIE_LIST_NEW(state) STMT_START { \
1144 Newxz( trie->states[ state ].trans.list, \
1145 4, reg_trie_trans_le ); \
1146 TRIE_LIST_CUR( state ) = 1; \
1147 TRIE_LIST_LEN( state ) = 4; \
1148} STMT_END
07be1b83 1149
786e8c11
YO
1150#define TRIE_HANDLE_WORD(state) STMT_START { \
1151 U16 dupe= trie->states[ state ].wordnum; \
1152 regnode * const noper_next = regnext( noper ); \
1153 \
1154 if (trie->wordlen) \
1155 trie->wordlen[ curword ] = wordlen; \
1156 DEBUG_r({ \
1157 /* store the word for dumping */ \
1158 SV* tmp; \
1159 if (OP(noper) != NOTHING) \
1160 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1161 else \
1162 tmp = newSVpvn( "", 0 ); \
1163 if ( UTF ) SvUTF8_on( tmp ); \
1164 av_push( trie->words, tmp ); \
1165 }); \
1166 \
1167 curword++; \
1168 \
1169 if ( noper_next < tail ) { \
1170 if (!trie->jump) \
1171 Newxz( trie->jump, word_count + 1, U16); \
1172 trie->jump[curword] = (U16)(tail - noper_next); \
1173 if (!jumper) \
1174 jumper = noper_next; \
1175 if (!nextbranch) \
1176 nextbranch= regnext(cur); \
1177 } \
1178 \
1179 if ( dupe ) { \
1180 /* So it's a dupe. This means we need to maintain a */\
1181 /* linked-list from the first to the next. */\
1182 /* we only allocate the nextword buffer when there */\
1183 /* a dupe, so first time we have to do the allocation */\
1184 if (!trie->nextword) \
1185 Newxz( trie->nextword, word_count + 1, U16); \
1186 while ( trie->nextword[dupe] ) \
1187 dupe= trie->nextword[dupe]; \
1188 trie->nextword[dupe]= curword; \
1189 } else { \
1190 /* we haven't inserted this word yet. */ \
1191 trie->states[ state ].wordnum = curword; \
1192 } \
1193} STMT_END
07be1b83 1194
3dab1dad 1195
786e8c11
YO
1196#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1197 ( ( base + charid >= ucharcount \
1198 && base + charid < ubound \
1199 && state == trie->trans[ base - ucharcount + charid ].check \
1200 && trie->trans[ base - ucharcount + charid ].next ) \
1201 ? trie->trans[ base - ucharcount + charid ].next \
1202 : ( state==1 ? special : 0 ) \
1203 )
3dab1dad 1204
786e8c11
YO
1205#define MADE_TRIE 1
1206#define MADE_JUMP_TRIE 2
1207#define MADE_EXACT_TRIE 4
3dab1dad 1208
a3621e74 1209STATIC I32
786e8c11 1210S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1211{
27da23d5 1212 dVAR;
a3621e74
YO
1213 /* first pass, loop through and scan words */
1214 reg_trie_data *trie;
1215 regnode *cur;
9f7f3913 1216 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1217 STRLEN len = 0;
1218 UV uvc = 0;
1219 U16 curword = 0;
1220 U32 next_alloc = 0;
786e8c11
YO
1221 regnode *jumper = NULL;
1222 regnode *nextbranch = NULL;
a3621e74 1223 /* we just use folder as a flag in utf8 */
e1ec3a88 1224 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1225 ? PL_fold
1226 : ( flags == EXACTFL
1227 ? PL_fold_locale
1228 : NULL
1229 )
1230 );
1231
e1ec3a88 1232 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1233 SV *re_trie_maxbuff;
3dab1dad
YO
1234#ifndef DEBUGGING
1235 /* these are only used during construction but are useful during
8e11feef 1236 * debugging so we store them in the struct when debugging.
8e11feef 1237 */
3dab1dad 1238 STRLEN trie_charcount=0;
3dab1dad
YO
1239 AV *trie_revcharmap;
1240#endif
a3621e74 1241 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1242#ifndef DEBUGGING
1243 PERL_UNUSED_ARG(depth);
1244#endif
a3621e74 1245
a02a5408 1246 Newxz( trie, 1, reg_trie_data );
a3621e74 1247 trie->refcount = 1;
3dab1dad 1248 trie->startstate = 1;
786e8c11 1249 trie->wordcount = word_count;
a3621e74 1250 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1251 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1252 if (!(UTF && folder))
1253 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1254 DEBUG_r({
1255 trie->words = newAV();
a3621e74 1256 });
3dab1dad 1257 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1258
0111c4fd 1259 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1260 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1261 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1262 }
3dab1dad
YO
1263 DEBUG_OPTIMISE_r({
1264 PerlIO_printf( Perl_debug_log,
786e8c11 1265 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1266 (int)depth * 2 + 2, "",
1267 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1268 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1269 (int)depth);
3dab1dad 1270 });
a3621e74
YO
1271 /* -- First loop and Setup --
1272
1273 We first traverse the branches and scan each word to determine if it
1274 contains widechars, and how many unique chars there are, this is
1275 important as we have to build a table with at least as many columns as we
1276 have unique chars.
1277
1278 We use an array of integers to represent the character codes 0..255
1279 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1280 native representation of the character value as the key and IV's for the
1281 coded index.
1282
1283 *TODO* If we keep track of how many times each character is used we can
1284 remap the columns so that the table compression later on is more
1285 efficient in terms of memory by ensuring most common value is in the
1286 middle and the least common are on the outside. IMO this would be better
1287 than a most to least common mapping as theres a decent chance the most
1288 common letter will share a node with the least common, meaning the node
1289 will not be compressable. With a middle is most common approach the worst
1290 case is when we have the least common nodes twice.
1291
1292 */
1293
a3621e74 1294 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1295 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1296 const U8 *uc = (U8*)STRING( noper );
a28509cc 1297 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1298 STRLEN foldlen = 0;
1299 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1300 const U8 *scan = (U8*)NULL;
07be1b83 1301 U32 wordlen = 0; /* required init */
3dab1dad 1302 STRLEN chars=0;
a3621e74 1303
3dab1dad
YO
1304 if (OP(noper) == NOTHING) {
1305 trie->minlen= 0;
1306 continue;
1307 }
1308 if (trie->bitmap) {
1309 TRIE_BITMAP_SET(trie,*uc);
1310 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1311 }
a3621e74 1312 for ( ; uc < e ; uc += len ) {
3dab1dad 1313 TRIE_CHARCOUNT(trie)++;
a3621e74 1314 TRIE_READ_CHAR;
3dab1dad 1315 chars++;
a3621e74
YO
1316 if ( uvc < 256 ) {
1317 if ( !trie->charmap[ uvc ] ) {
1318 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1319 if ( folder )
1320 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1321 TRIE_STORE_REVCHAR;
a3621e74
YO
1322 }
1323 } else {
1324 SV** svpp;
1325 if ( !trie->widecharmap )
1326 trie->widecharmap = newHV();
1327
1328 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1329
1330 if ( !svpp )
e4584336 1331 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1332
1333 if ( !SvTRUE( *svpp ) ) {
1334 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1335 TRIE_STORE_REVCHAR;
a3621e74
YO
1336 }
1337 }
1338 }
3dab1dad
YO
1339 if( cur == first ) {
1340 trie->minlen=chars;
1341 trie->maxlen=chars;
1342 } else if (chars < trie->minlen) {
1343 trie->minlen=chars;
1344 } else if (chars > trie->maxlen) {
1345 trie->maxlen=chars;
1346 }
1347
a3621e74
YO
1348 } /* end first pass */
1349 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1350 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1351 (int)depth * 2 + 2,"",
85c3142d 1352 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1353 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1354 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1355 );
786e8c11 1356 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1357
1358 /*
1359 We now know what we are dealing with in terms of unique chars and
1360 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1361 representation using a flat table will take. If it's over a reasonable
1362 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1363 conservative but potentially much slower representation using an array
1364 of lists.
1365
1366 At the end we convert both representations into the same compressed
1367 form that will be used in regexec.c for matching with. The latter
1368 is a form that cannot be used to construct with but has memory
1369 properties similar to the list form and access properties similar
1370 to the table form making it both suitable for fast searches and
1371 small enough that its feasable to store for the duration of a program.
1372
1373 See the comment in the code where the compressed table is produced
1374 inplace from the flat tabe representation for an explanation of how
1375 the compression works.
1376
1377 */
1378
1379
3dab1dad 1380 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1381 /*
1382 Second Pass -- Array Of Lists Representation
1383
1384 Each state will be represented by a list of charid:state records
1385 (reg_trie_trans_le) the first such element holds the CUR and LEN
1386 points of the allocated array. (See defines above).
1387
1388 We build the initial structure using the lists, and then convert
1389 it into the compressed table form which allows faster lookups
1390 (but cant be modified once converted).
a3621e74
YO
1391 */
1392
a3621e74
YO
1393 STRLEN transcount = 1;
1394
3dab1dad 1395 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1396 TRIE_LIST_NEW(1);
1397 next_alloc = 2;
1398
1399 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1400
c445ea15
AL
1401 regnode * const noper = NEXTOPER( cur );
1402 U8 *uc = (U8*)STRING( noper );
1403 const U8 * const e = uc + STR_LEN( noper );
1404 U32 state = 1; /* required init */
1405 U16 charid = 0; /* sanity init */
1406 U8 *scan = (U8*)NULL; /* sanity init */
1407 STRLEN foldlen = 0; /* required init */
07be1b83 1408 U32 wordlen = 0; /* required init */
c445ea15
AL
1409 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1410
3dab1dad 1411 if (OP(noper) != NOTHING) {
786e8c11 1412 for ( ; uc < e ; uc += len ) {
c445ea15 1413
786e8c11 1414 TRIE_READ_CHAR;
c445ea15 1415
786e8c11
YO
1416 if ( uvc < 256 ) {
1417 charid = trie->charmap[ uvc ];
c445ea15 1418 } else {
786e8c11
YO
1419 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1420 if ( !svpp ) {
1421 charid = 0;
1422 } else {
1423 charid=(U16)SvIV( *svpp );
1424 }
c445ea15 1425 }
786e8c11
YO
1426 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1427 if ( charid ) {
a3621e74 1428
786e8c11
YO
1429 U16 check;
1430 U32 newstate = 0;
a3621e74 1431
786e8c11
YO
1432 charid--;
1433 if ( !trie->states[ state ].trans.list ) {
1434 TRIE_LIST_NEW( state );
c445ea15 1435 }
786e8c11
YO
1436 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1437 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1438 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1439 break;
1440 }
1441 }
1442 if ( ! newstate ) {
1443 newstate = next_alloc++;
1444 TRIE_LIST_PUSH( state, charid, newstate );
1445 transcount++;
1446 }
1447 state = newstate;
1448 } else {
1449 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1450 }
a28509cc 1451 }
c445ea15 1452 }
3dab1dad 1453 TRIE_HANDLE_WORD(state);
a3621e74
YO
1454
1455 } /* end second pass */
1456
786e8c11 1457 trie->laststate = next_alloc;
a3621e74
YO
1458 Renew( trie->states, next_alloc, reg_trie_state );
1459
3dab1dad
YO
1460 /* and now dump it out before we compress it */
1461 DEBUG_TRIE_COMPILE_MORE_r(
1462 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1463 );
a3621e74 1464
a02a5408 1465 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1466 {
1467 U32 state;
a3621e74
YO
1468 U32 tp = 0;
1469 U32 zp = 0;
1470
1471
1472 for( state=1 ; state < next_alloc ; state ++ ) {
1473 U32 base=0;
1474
1475 /*
1476 DEBUG_TRIE_COMPILE_MORE_r(
1477 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1478 );
1479 */
1480
1481 if (trie->states[state].trans.list) {
1482 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1483 U16 maxid=minid;
a28509cc 1484 U16 idx;
a3621e74
YO
1485
1486 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1487 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1488 if ( forid < minid ) {
1489 minid=forid;
1490 } else if ( forid > maxid ) {
1491 maxid=forid;
1492 }
a3621e74
YO
1493 }
1494 if ( transcount < tp + maxid - minid + 1) {
1495 transcount *= 2;
1496 Renew( trie->trans, transcount, reg_trie_trans );
1497 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1498 }
1499 base = trie->uniquecharcount + tp - minid;
1500 if ( maxid == minid ) {
1501 U32 set = 0;
1502 for ( ; zp < tp ; zp++ ) {
1503 if ( ! trie->trans[ zp ].next ) {
1504 base = trie->uniquecharcount + zp - minid;
1505 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1506 trie->trans[ zp ].check = state;
1507 set = 1;
1508 break;
1509 }
1510 }
1511 if ( !set ) {
1512 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1513 trie->trans[ tp ].check = state;
1514 tp++;
1515 zp = tp;
1516 }
1517 } else {
1518 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1519 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1520 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1521 trie->trans[ tid ].check = state;
1522 }
1523 tp += ( maxid - minid + 1 );
1524 }
1525 Safefree(trie->states[ state ].trans.list);
1526 }
1527 /*
1528 DEBUG_TRIE_COMPILE_MORE_r(
1529 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1530 );
1531 */
1532 trie->states[ state ].trans.base=base;
1533 }
cc601c31 1534 trie->lasttrans = tp + 1;
a3621e74
YO
1535 }
1536 } else {
1537 /*
1538 Second Pass -- Flat Table Representation.
1539
1540 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1541 We know that we will need Charcount+1 trans at most to store the data
1542 (one row per char at worst case) So we preallocate both structures
1543 assuming worst case.
1544
1545 We then construct the trie using only the .next slots of the entry
1546 structs.
1547
1548 We use the .check field of the first entry of the node temporarily to
1549 make compression both faster and easier by keeping track of how many non
1550 zero fields are in the node.
1551
1552 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1553 transition.
1554
1555 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1556 number representing the first entry of the node, and state as a
1557 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1558 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1559 are 2 entrys per node. eg:
1560
1561 A B A B
1562 1. 2 4 1. 3 7
1563 2. 0 3 3. 0 5
1564 3. 0 0 5. 0 0
1565 4. 0 0 7. 0 0
1566
1567 The table is internally in the right hand, idx form. However as we also
1568 have to deal with the states array which is indexed by nodenum we have to
1569 use TRIE_NODENUM() to convert.
1570
1571 */
1572
3dab1dad
YO
1573
1574 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1575 reg_trie_trans );
3dab1dad 1576 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1577 next_alloc = trie->uniquecharcount + 1;
1578
3dab1dad 1579
a3621e74
YO
1580 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1581
c445ea15 1582 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1583 const U8 *uc = (U8*)STRING( noper );
1584 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1585
1586 U32 state = 1; /* required init */
1587
1588 U16 charid = 0; /* sanity init */
1589 U32 accept_state = 0; /* sanity init */
1590 U8 *scan = (U8*)NULL; /* sanity init */
1591
1592 STRLEN foldlen = 0; /* required init */
07be1b83 1593 U32 wordlen = 0; /* required init */
a3621e74
YO
1594 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1595
3dab1dad 1596 if ( OP(noper) != NOTHING ) {
786e8c11 1597 for ( ; uc < e ; uc += len ) {
a3621e74 1598
786e8c11 1599 TRIE_READ_CHAR;
a3621e74 1600
786e8c11
YO
1601 if ( uvc < 256 ) {
1602 charid = trie->charmap[ uvc ];
1603 } else {
1604 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1605 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1606 }
786e8c11
YO
1607 if ( charid ) {
1608 charid--;
1609 if ( !trie->trans[ state + charid ].next ) {
1610 trie->trans[ state + charid ].next = next_alloc;
1611 trie->trans[ state ].check++;
1612 next_alloc += trie->uniquecharcount;
1613 }
1614 state = trie->trans[ state + charid ].next;
1615 } else {
1616 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1617 }
1618 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1619 }
a3621e74 1620 }
3dab1dad
YO
1621 accept_state = TRIE_NODENUM( state );
1622 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1623
1624 } /* end second pass */
1625
3dab1dad
YO
1626 /* and now dump it out before we compress it */
1627 DEBUG_TRIE_COMPILE_MORE_r(
1628 dump_trie_interim_table(trie,next_alloc,depth+1)
1629 );
a3621e74 1630
a3621e74
YO
1631 {
1632 /*
1633 * Inplace compress the table.*
1634
1635 For sparse data sets the table constructed by the trie algorithm will
1636 be mostly 0/FAIL transitions or to put it another way mostly empty.
1637 (Note that leaf nodes will not contain any transitions.)
1638
1639 This algorithm compresses the tables by eliminating most such
1640 transitions, at the cost of a modest bit of extra work during lookup:
1641
1642 - Each states[] entry contains a .base field which indicates the
1643 index in the state[] array wheres its transition data is stored.
1644
1645 - If .base is 0 there are no valid transitions from that node.
1646
1647 - If .base is nonzero then charid is added to it to find an entry in
1648 the trans array.
1649
1650 -If trans[states[state].base+charid].check!=state then the
1651 transition is taken to be a 0/Fail transition. Thus if there are fail
1652 transitions at the front of the node then the .base offset will point
1653 somewhere inside the previous nodes data (or maybe even into a node
1654 even earlier), but the .check field determines if the transition is
1655 valid.
1656
786e8c11 1657 XXX - wrong maybe?
a3621e74
YO
1658 The following process inplace converts the table to the compressed
1659 table: We first do not compress the root node 1,and mark its all its
1660 .check pointers as 1 and set its .base pointer as 1 as well. This
1661 allows to do a DFA construction from the compressed table later, and
1662 ensures that any .base pointers we calculate later are greater than
1663 0.
1664
1665 - We set 'pos' to indicate the first entry of the second node.
1666
1667 - We then iterate over the columns of the node, finding the first and
1668 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1669 and set the .check pointers accordingly, and advance pos
1670 appropriately and repreat for the next node. Note that when we copy
1671 the next pointers we have to convert them from the original
1672 NODEIDX form to NODENUM form as the former is not valid post
1673 compression.
1674
1675 - If a node has no transitions used we mark its base as 0 and do not
1676 advance the pos pointer.
1677
1678 - If a node only has one transition we use a second pointer into the
1679 structure to fill in allocated fail transitions from other states.
1680 This pointer is independent of the main pointer and scans forward
1681 looking for null transitions that are allocated to a state. When it
1682 finds one it writes the single transition into the "hole". If the
786e8c11 1683 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1684
1685 - Once compressed we can Renew/realloc the structures to release the
1686 excess space.
1687
1688 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1689 specifically Fig 3.47 and the associated pseudocode.
1690
1691 demq
1692 */
a3b680e6 1693 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1694 U32 state, charid;
a3621e74 1695 U32 pos = 0, zp=0;
786e8c11 1696 trie->laststate = laststate;
a3621e74
YO
1697
1698 for ( state = 1 ; state < laststate ; state++ ) {
1699 U8 flag = 0;
a28509cc
AL
1700 const U32 stateidx = TRIE_NODEIDX( state );
1701 const U32 o_used = trie->trans[ stateidx ].check;
1702 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1703 trie->trans[ stateidx ].check = 0;
1704
1705 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1706 if ( flag || trie->trans[ stateidx + charid ].next ) {
1707 if ( trie->trans[ stateidx + charid ].next ) {
1708 if (o_used == 1) {
1709 for ( ; zp < pos ; zp++ ) {
1710 if ( ! trie->trans[ zp ].next ) {
1711 break;
1712 }
1713 }
1714 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1715 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1716 trie->trans[ zp ].check = state;
1717 if ( ++zp > pos ) pos = zp;
1718 break;
1719 }
1720 used--;
1721 }
1722 if ( !flag ) {
1723 flag = 1;
1724 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1725 }
1726 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1727 trie->trans[ pos ].check = state;
1728 pos++;
1729 }
1730 }
1731 }
cc601c31 1732 trie->lasttrans = pos + 1;
a3621e74
YO
1733 Renew( trie->states, laststate + 1, reg_trie_state);
1734 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1735 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1736 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1737 (int)depth * 2 + 2,"",
1738 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1739 (IV)next_alloc,
1740 (IV)pos,
a3621e74
YO
1741 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1742 );
1743
1744 } /* end table compress */
1745 }
cc601c31
YO
1746 /* resize the trans array to remove unused space */
1747 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1748
3dab1dad
YO
1749 /* and now dump out the compressed format */
1750 DEBUG_TRIE_COMPILE_r(
1751 dump_trie(trie,depth+1)
1752 );
07be1b83 1753
3dab1dad
YO
1754 { /* Modify the program and insert the new TRIE node*/
1755 regnode *convert;
1756 U8 nodetype =(U8)(flags & 0xFF);
1757 char *str=NULL;
786e8c11 1758
07be1b83 1759#ifdef DEBUGGING
a5ca303d 1760 regnode *optimize;
b57a0404
JH
1761 U32 mjd_offset = 0;
1762 U32 mjd_nodelen = 0;
07be1b83 1763#endif
a3621e74 1764 /*
3dab1dad
YO
1765 This means we convert either the first branch or the first Exact,
1766 depending on whether the thing following (in 'last') is a branch
1767 or not and whther first is the startbranch (ie is it a sub part of
1768 the alternation or is it the whole thing.)
1769 Assuming its a sub part we conver the EXACT otherwise we convert
1770 the whole branch sequence, including the first.
a3621e74 1771 */
3dab1dad
YO
1772 /* Find the node we are going to overwrite */
1773 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1774 /* whole branch chain */
3dab1dad 1775 convert = first;
07be1b83
YO
1776 DEBUG_r({
1777 const regnode *nop = NEXTOPER( convert );
1778 mjd_offset= Node_Offset((nop));
1779 mjd_nodelen= Node_Length((nop));
1780 });
1781 } else {
1782 /* branch sub-chain */
3dab1dad
YO
1783 convert = NEXTOPER( first );
1784 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1785 DEBUG_r({
1786 mjd_offset= Node_Offset((convert));
1787 mjd_nodelen= Node_Length((convert));
1788 });
1789 }
1790 DEBUG_OPTIMISE_r(
1791 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1792 (int)depth * 2 + 2, "",
786e8c11 1793 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1794 );
a3621e74 1795
3dab1dad
YO
1796 /* But first we check to see if there is a common prefix we can
1797 split out as an EXACT and put in front of the TRIE node. */
1798 trie->startstate= 1;
786e8c11 1799 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad
YO
1800 U32 state;
1801 DEBUG_OPTIMISE_r(
8e11feef
RGS
1802 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1803 (int)depth * 2 + 2, "",
786e8c11 1804 (UV)trie->laststate)
8e11feef 1805 );
786e8c11 1806 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
a3621e74 1807 U32 ofs = 0;
8e11feef
RGS
1808 I32 idx = -1;
1809 U32 count = 0;
1810 const U32 base = trie->states[ state ].trans.base;
a3621e74 1811
3dab1dad 1812 if ( trie->states[state].wordnum )
8e11feef 1813 count = 1;
a3621e74 1814
8e11feef 1815 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1816 if ( ( base + ofs >= trie->uniquecharcount ) &&
1817 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1818 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1819 {
3dab1dad 1820 if ( ++count > 1 ) {
8e11feef 1821 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1822 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1823 if ( state == 1 ) break;
3dab1dad
YO
1824 if ( count == 2 ) {
1825 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1826 DEBUG_OPTIMISE_r(
8e11feef
RGS
1827 PerlIO_printf(Perl_debug_log,
1828 "%*sNew Start State=%"UVuf" Class: [",
1829 (int)depth * 2 + 2, "",
786e8c11 1830 (UV)state));
be8e71aa
YO
1831 if (idx >= 0) {
1832 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1833 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1834
3dab1dad 1835 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1836 if ( folder )
1837 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1838 DEBUG_OPTIMISE_r(
07be1b83 1839 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1840 );
8e11feef
RGS
1841 }
1842 }
1843 TRIE_BITMAP_SET(trie,*ch);
1844 if ( folder )
1845 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1846 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1847 }
1848 idx = ofs;
1849 }
3dab1dad
YO
1850 }
1851 if ( count == 1 ) {
1852 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1853 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1854 DEBUG_OPTIMISE_r(
8e11feef
RGS
1855 PerlIO_printf( Perl_debug_log,
1856 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1857 (int)depth * 2 + 2, "",
786e8c11 1858 (UV)state, (UV)idx, ch)
3dab1dad
YO
1859 );
1860 if ( state==1 ) {
1861 OP( convert ) = nodetype;
1862 str=STRING(convert);
1863 STR_LEN(convert)=0;
1864 }
1865 *str++=*ch;
1866 STR_LEN(convert)++;
a3621e74 1867
8e11feef 1868 } else {
f9049ba1 1869#ifdef DEBUGGING
8e11feef
RGS
1870 if (state>1)
1871 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1872#endif
8e11feef
RGS
1873 break;
1874 }
1875 }
3dab1dad 1876 if (str) {
8e11feef 1877 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1878 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1879 trie->startstate = state;
07be1b83
YO
1880 trie->minlen -= (state - 1);
1881 trie->maxlen -= (state - 1);
1882 DEBUG_r({
1883 regnode *fix = convert;
1884 mjd_nodelen++;
1885 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1886 while( ++fix < n ) {
1887 Set_Node_Offset_Length(fix, 0, 0);
1888 }
1889 });
8e11feef
RGS
1890 if (trie->maxlen) {
1891 convert = n;
1892 } else {
3dab1dad 1893 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1894 DEBUG_r(optimize= n);
3dab1dad
YO
1895 }
1896 }
1897 }
a5ca303d
YO
1898 if (!jumper)
1899 jumper = last;
3dab1dad 1900 if ( trie->maxlen ) {
8e11feef
RGS
1901 NEXT_OFF( convert ) = (U16)(tail - convert);
1902 ARG_SET( convert, data_slot );
786e8c11
YO
1903 /* Store the offset to the first unabsorbed branch in
1904 jump[0], which is otherwise unused by the jump logic.
1905 We use this when dumping a trie and during optimisation. */
1906 if (trie->jump)
1907 trie->jump[0] = (U16)(tail - nextbranch);
a5ca303d 1908
786e8c11
YO
1909 /* XXXX */
1910 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1911 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1912 {
1913 OP( convert ) = TRIEC;
1914 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1915 Safefree(trie->bitmap);
1916 trie->bitmap= NULL;
1917 } else
1918 OP( convert ) = TRIE;
a3621e74 1919
3dab1dad
YO
1920 /* store the type in the flags */
1921 convert->flags = nodetype;
a5ca303d
YO
1922 DEBUG_r({
1923 optimize = convert
1924 + NODE_STEP_REGNODE
1925 + regarglen[ OP( convert ) ];
1926 });
1927 /* XXX We really should free up the resource in trie now,
1928 as we won't use them - (which resources?) dmq */
3dab1dad 1929 }
a3621e74
YO
1930 /* needed for dumping*/
1931 DEBUG_r({
07be1b83
YO
1932 regnode *opt = convert;
1933 while (++opt<optimize) {
1934 Set_Node_Offset_Length(opt,0,0);
1935 }
786e8c11
YO
1936 /*
1937 Try to clean up some of the debris left after the
1938 optimisation.
a3621e74 1939 */
786e8c11 1940 while( optimize < jumper ) {
07be1b83 1941 mjd_nodelen += Node_Length((optimize));
a3621e74 1942 OP( optimize ) = OPTIMIZED;
07be1b83 1943 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1944 optimize++;
1945 }
07be1b83 1946 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1947 });
1948 } /* end node insert */
07be1b83 1949#ifndef DEBUGGING
6e8b4190 1950 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1951#endif
786e8c11
YO
1952 return trie->jump
1953 ? MADE_JUMP_TRIE
1954 : trie->startstate>1
1955 ? MADE_EXACT_TRIE
1956 : MADE_TRIE;
1957}
1958
1959STATIC void
1960S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1961{
1962/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1963
1964 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1965 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1966 ISBN 0-201-10088-6
1967
1968 We find the fail state for each state in the trie, this state is the longest proper
1969 suffix of the current states 'word' that is also a proper prefix of another word in our
1970 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1971 the DFA not to have to restart after its tried and failed a word at a given point, it
1972 simply continues as though it had been matching the other word in the first place.
1973 Consider
1974 'abcdgu'=~/abcdefg|cdgu/
1975 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1976 fail, which would bring use to the state representing 'd' in the second word where we would
1977 try 'g' and succeed, prodceding to match 'cdgu'.
1978 */
1979 /* add a fail transition */
1980 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1981 U32 *q;
1982 const U32 ucharcount = trie->uniquecharcount;
1983 const U32 numstates = trie->laststate;
1984 const U32 ubound = trie->lasttrans + ucharcount;
1985 U32 q_read = 0;
1986 U32 q_write = 0;
1987 U32 charid;
1988 U32 base = trie->states[ 1 ].trans.base;
1989 U32 *fail;
1990 reg_ac_data *aho;
1991 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1992 GET_RE_DEBUG_FLAGS_DECL;
1993#ifndef DEBUGGING
1994 PERL_UNUSED_ARG(depth);
1995#endif
1996
1997
1998 ARG_SET( stclass, data_slot );
1999 Newxz( aho, 1, reg_ac_data );
2000 RExC_rx->data->data[ data_slot ] = (void*)aho;
2001 aho->trie=trie;
2002 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2003 (trie->laststate+1)*sizeof(reg_trie_state));
2004 Newxz( q, numstates, U32);
2005 Newxz( aho->fail, numstates, U32 );
2006 aho->refcount = 1;
2007 fail = aho->fail;
2008 /* initialize fail[0..1] to be 1 so that we always have
2009 a valid final fail state */
2010 fail[ 0 ] = fail[ 1 ] = 1;
2011
2012 for ( charid = 0; charid < ucharcount ; charid++ ) {
2013 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2014 if ( newstate ) {
2015 q[ q_write ] = newstate;
2016 /* set to point at the root */
2017 fail[ q[ q_write++ ] ]=1;
2018 }
2019 }
2020 while ( q_read < q_write) {
2021 const U32 cur = q[ q_read++ % numstates ];
2022 base = trie->states[ cur ].trans.base;
2023
2024 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2025 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2026 if (ch_state) {
2027 U32 fail_state = cur;
2028 U32 fail_base;
2029 do {
2030 fail_state = fail[ fail_state ];
2031 fail_base = aho->states[ fail_state ].trans.base;
2032 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2033
2034 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2035 fail[ ch_state ] = fail_state;
2036 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2037 {
2038 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2039 }
2040 q[ q_write++ % numstates] = ch_state;
2041 }
2042 }
2043 }
2044 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2045 when we fail in state 1, this allows us to use the
2046 charclass scan to find a valid start char. This is based on the principle
2047 that theres a good chance the string being searched contains lots of stuff
2048 that cant be a start char.
2049 */
2050 fail[ 0 ] = fail[ 1 ] = 0;
2051 DEBUG_TRIE_COMPILE_r({
2052 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
2053 for( q_read=1; q_read<numstates; q_read++ ) {
2054 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2055 }
2056 PerlIO_printf(Perl_debug_log, "\n");
2057 });
2058 Safefree(q);
2059 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2060}
2061
786e8c11 2062
a3621e74 2063/*
5d1c421c
JH
2064 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2065 * These need to be revisited when a newer toolchain becomes available.
2066 */
2067#if defined(__sparc64__) && defined(__GNUC__)
2068# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2069# undef SPARC64_GCC_WORKAROUND
2070# define SPARC64_GCC_WORKAROUND 1
2071# endif
2072#endif
2073
07be1b83
YO
2074#define DEBUG_PEEP(str,scan,depth) \
2075 DEBUG_OPTIMISE_r({ \
2076 SV * const mysv=sv_newmortal(); \
2077 regnode *Next = regnext(scan); \
2078 regprop(RExC_rx, mysv, scan); \
2079 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
2080 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2081 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2082 });
2083
1de06328
YO
2084
2085
2086
2087
07be1b83
YO
2088#define JOIN_EXACT(scan,min,flags) \
2089 if (PL_regkind[OP(scan)] == EXACT) \
2090 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2091
be8e71aa 2092STATIC U32
07be1b83
YO
2093S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2094 /* Merge several consecutive EXACTish nodes into one. */
2095 regnode *n = regnext(scan);
2096 U32 stringok = 1;
2097 regnode *next = scan + NODE_SZ_STR(scan);
2098 U32 merged = 0;
2099 U32 stopnow = 0;
2100#ifdef DEBUGGING
2101 regnode *stop = scan;
72f13be8 2102 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2103#else
d47053eb
RGS
2104 PERL_UNUSED_ARG(depth);
2105#endif
2106#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2107 PERL_UNUSED_ARG(flags);
2108 PERL_UNUSED_ARG(val);
07be1b83 2109#endif
07be1b83
YO
2110 DEBUG_PEEP("join",scan,depth);
2111
2112 /* Skip NOTHING, merge EXACT*. */
2113 while (n &&
2114 ( PL_regkind[OP(n)] == NOTHING ||
2115 (stringok && (OP(n) == OP(scan))))
2116 && NEXT_OFF(n)
2117 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2118
2119 if (OP(n) == TAIL || n > next)
2120 stringok = 0;
2121 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2122 DEBUG_PEEP("skip:",n,depth);
2123 NEXT_OFF(scan) += NEXT_OFF(n);
2124 next = n + NODE_STEP_REGNODE;
2125#ifdef DEBUGGING
2126 if (stringok)
2127 stop = n;
2128#endif
2129 n = regnext(n);
2130 }
2131 else if (stringok) {
786e8c11 2132 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2133 regnode * const nnext = regnext(n);
2134
2135 DEBUG_PEEP("merg",n,depth);
2136
2137 merged++;
2138 if (oldl + STR_LEN(n) > U8_MAX)
2139 break;
2140 NEXT_OFF(scan) += NEXT_OFF(n);
2141 STR_LEN(scan) += STR_LEN(n);
2142 next = n + NODE_SZ_STR(n);
2143 /* Now we can overwrite *n : */
2144 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2145#ifdef DEBUGGING
2146 stop = next - 1;
2147#endif
2148 n = nnext;
2149 if (stopnow) break;
2150 }
2151
d47053eb
RGS
2152#ifdef EXPERIMENTAL_INPLACESCAN
2153 if (flags && !NEXT_OFF(n)) {
2154 DEBUG_PEEP("atch", val, depth);
2155 if (reg_off_by_arg[OP(n)]) {
2156 ARG_SET(n, val - n);
2157 }
2158 else {
2159 NEXT_OFF(n) = val - n;
2160 }
2161 stopnow = 1;
2162 }
07be1b83
YO
2163#endif
2164 }
2165
2166 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2167 /*
2168 Two problematic code points in Unicode casefolding of EXACT nodes:
2169
2170 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2171 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2172
2173 which casefold to
2174
2175 Unicode UTF-8
2176
2177 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2178 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2179
2180 This means that in case-insensitive matching (or "loose matching",
2181 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2182 length of the above casefolded versions) can match a target string
2183 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2184 This would rather mess up the minimum length computation.
2185
2186 What we'll do is to look for the tail four bytes, and then peek
2187 at the preceding two bytes to see whether we need to decrease
2188 the minimum length by four (six minus two).
2189
2190 Thanks to the design of UTF-8, there cannot be false matches:
2191 A sequence of valid UTF-8 bytes cannot be a subsequence of
2192 another valid sequence of UTF-8 bytes.
2193
2194 */
2195 char * const s0 = STRING(scan), *s, *t;
2196 char * const s1 = s0 + STR_LEN(scan) - 1;
2197 char * const s2 = s1 - 4;
e294cc5d
JH
2198#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2199 const char t0[] = "\xaf\x49\xaf\x42";
2200#else
07be1b83 2201 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2202#endif
07be1b83
YO
2203 const char * const t1 = t0 + 3;
2204
2205 for (s = s0 + 2;
2206 s < s2 && (t = ninstr(s, s1, t0, t1));
2207 s = t + 4) {
e294cc5d
JH
2208#ifdef EBCDIC
2209 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2210 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2211#else
07be1b83
YO
2212 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2213 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2214#endif
07be1b83
YO
2215 *min -= 4;
2216 }
2217 }
2218
2219#ifdef DEBUGGING
2220 /* Allow dumping */
2221 n = scan + NODE_SZ_STR(scan);
2222 while (n <= stop) {
2223 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2224 OP(n) = OPTIMIZED;
2225 NEXT_OFF(n) = 0;
2226 }
2227 n++;
2228 }
2229#endif
2230 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2231 return stopnow;
2232}
2233
653099ff
GS
2234/* REx optimizer. Converts nodes into quickier variants "in place".
2235 Finds fixed substrings. */
2236
a0288114 2237/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2238 to the position after last scanned or to NULL. */
2239
07be1b83
YO
2240
2241
76e3520e 2242STATIC I32
1de06328
YO
2243S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2244 I32 *minlenp, I32 *deltap,
9a957fbc 2245 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2246 /* scanp: Start here (read-write). */
2247 /* deltap: Write maxlen-minlen here. */
2248 /* last: Stop before this one. */
2249{
97aff369 2250 dVAR;
c277df42
IZ
2251 I32 min = 0, pars = 0, code;
2252 regnode *scan = *scanp, *next;
2253 I32 delta = 0;
2254 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2255 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2256 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2257 scan_data_t data_fake;
653099ff 2258 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2259 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2260 regnode *first_non_open = scan;
2261
a3621e74
YO
2262
2263 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2264#ifdef DEBUGGING
2265 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2266#endif
786e8c11
YO
2267 if ( depth == 0 ) {
2268 while (first_non_open && OP(first_non_open) == OPEN)
2269 first_non_open=regnext(first_non_open);
2270 }
2271
b81d288d 2272
c277df42
IZ
2273 while (scan && OP(scan) != END && scan < last) {
2274 /* Peephole optimizer: */
1de06328 2275 DEBUG_STUDYDATA(data,depth);
07be1b83 2276 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2277 JOIN_EXACT(scan,&min,0);
a3621e74 2278
653099ff
GS
2279 /* Follow the next-chain of the current node and optimize
2280 away all the NOTHINGs from it. */
c277df42 2281 if (OP(scan) != CURLYX) {
a3b680e6 2282 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2283 ? I32_MAX
2284 /* I32 may be smaller than U16 on CRAYs! */
2285 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2286 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2287 int noff;
2288 regnode *n = scan;
b81d288d 2289
c277df42
IZ
2290 /* Skip NOTHING and LONGJMP. */
2291 while ((n = regnext(n))
3dab1dad 2292 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2293 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2294 && off + noff < max)
2295 off += noff;
2296 if (reg_off_by_arg[OP(scan)])
2297 ARG(scan) = off;
b81d288d 2298 else
c277df42
IZ
2299 NEXT_OFF(scan) = off;
2300 }
a3621e74 2301
07be1b83 2302
3dab1dad 2303
653099ff
GS
2304 /* The principal pseudo-switch. Cannot be a switch, since we
2305 look into several different things. */
b81d288d 2306 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2307 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2308 next = regnext(scan);
2309 code = OP(scan);
a3621e74 2310 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2311
2312 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2313 /* NOTE - There is similar code to this block below for handling
2314 TRIE nodes on a re-study. If you change stuff here check there
2315 too. */
c277df42 2316 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2317 struct regnode_charclass_class accum;
d4c19fe8 2318 regnode * const startbranch=scan;
c277df42 2319
653099ff 2320 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2321 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2322 if (flags & SCF_DO_STCLASS)
830247a4 2323 cl_init_zero(pRExC_state, &accum);
a3621e74 2324
c277df42 2325 while (OP(scan) == code) {
830247a4 2326 I32 deltanext, minnext, f = 0, fake;
653099ff 2327 struct regnode_charclass_class this_class;
c277df42
IZ
2328
2329 num++;
2330 data_fake.flags = 0;
b81d288d 2331 if (data) {
2c2d71f5 2332 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2333 data_fake.last_closep = data->last_closep;
2334 }
2335 else
2336 data_fake.last_closep = &fake;
c277df42
IZ
2337 next = regnext(scan);
2338 scan = NEXTOPER(scan);
2339 if (code != BRANCH)
2340 scan = NEXTOPER(scan);
653099ff 2341 if (flags & SCF_DO_STCLASS) {
830247a4 2342 cl_init(pRExC_state, &this_class);
653099ff
GS
2343 data_fake.start_class = &this_class;
2344 f = SCF_DO_STCLASS_AND;
b81d288d 2345 }
e1901655
IZ
2346 if (flags & SCF_WHILEM_VISITED_POS)
2347 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2348
653099ff 2349 /* we suppose the run is continuous, last=next...*/
1de06328 2350 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
a3621e74 2351 next, &data_fake, f,depth+1);
b81d288d 2352 if (min1 > minnext)
c277df42
IZ
2353 min1 = minnext;
2354 if (max1 < minnext + deltanext)
2355 max1 = minnext + deltanext;
2356 if (deltanext == I32_MAX)
aca2d497 2357 is_inf = is_inf_internal = 1;
c277df42
IZ
2358 scan = next;
2359 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2360 pars++;
3dab1dad
YO
2361 if (data) {
2362 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2363 data->flags |= SF_HAS_EVAL;
2c2d71f5 2364 data->whilem_c = data_fake.whilem_c;
3dab1dad 2365 }
653099ff 2366 if (flags & SCF_DO_STCLASS)
830247a4 2367 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2368 if (code == SUSPEND)
c277df42
IZ
2369 break;
2370 }
2371 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2372 min1 = 0;
2373 if (flags & SCF_DO_SUBSTR) {
2374 data->pos_min += min1;
2375 data->pos_delta += max1 - min1;
2376 if (max1 != min1 || is_inf)
2377 data->longest = &(data->longest_float);
2378 }
2379 min += min1;
2380 delta += max1 - min1;
653099ff 2381 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2382 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2383 if (min1) {
2384 cl_and(data->start_class, &and_with);
2385 flags &= ~SCF_DO_STCLASS;
2386 }
2387 }
2388 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2389 if (min1) {
2390 cl_and(data->start_class, &accum);
653099ff 2391 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2392 }
2393 else {
b81d288d 2394 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2395 * data->start_class */
2396 StructCopy(data->start_class, &and_with,
2397 struct regnode_charclass_class);
2398 flags &= ~SCF_DO_STCLASS_AND;
2399 StructCopy(&accum, data->start_class,
2400 struct regnode_charclass_class);
2401 flags |= SCF_DO_STCLASS_OR;
2402 data->start_class->flags |= ANYOF_EOS;
2403 }
653099ff 2404 }
a3621e74 2405
786e8c11 2406 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2407 /* demq.
2408
2409 Assuming this was/is a branch we are dealing with: 'scan' now
2410 points at the item that follows the branch sequence, whatever
2411 it is. We now start at the beginning of the sequence and look
2412 for subsequences of
2413
786e8c11
YO
2414 BRANCH->EXACT=>x1
2415 BRANCH->EXACT=>x2
2416 tail
a3621e74
YO
2417
2418 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2419
2420 If we can find such a subseqence we need to turn the first
2421 element into a trie and then add the subsequent branch exact
2422 strings to the trie.
2423
2424 We have two cases
2425
786e8c11 2426 1. patterns where the whole set of branch can be converted.
a3621e74 2427
786e8c11 2428 2. patterns where only a subset can be converted.
a3621e74
YO
2429
2430 In case 1 we can replace the whole set with a single regop
2431 for the trie. In case 2 we need to keep the start and end
2432 branchs so
2433
2434 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2435 becomes BRANCH TRIE; BRANCH X;
2436
786e8c11
YO
2437 There is an additional case, that being where there is a
2438 common prefix, which gets split out into an EXACT like node
2439 preceding the TRIE node.
2440
2441 If x(1..n)==tail then we can do a simple trie, if not we make
2442 a "jump" trie, such that when we match the appropriate word
2443 we "jump" to the appopriate tail node. Essentailly we turn
2444 a nested if into a case structure of sorts.
a3621e74
YO
2445
2446 */
786e8c11 2447
3dab1dad 2448 int made=0;
0111c4fd
RGS
2449 if (!re_trie_maxbuff) {
2450 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2451 if (!SvIOK(re_trie_maxbuff))
2452 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2453 }
786e8c11 2454 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2455 regnode *cur;
2456 regnode *first = (regnode *)NULL;
2457 regnode *last = (regnode *)NULL;
2458 regnode *tail = scan;
2459 U8 optype = 0;
2460 U32 count=0;
2461
2462#ifdef DEBUGGING
c445ea15 2463 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2464#endif
2465 /* var tail is used because there may be a TAIL
2466 regop in the way. Ie, the exacts will point to the
2467 thing following the TAIL, but the last branch will
2468 point at the TAIL. So we advance tail. If we
2469 have nested (?:) we may have to move through several
2470 tails.
2471 */
2472
2473 while ( OP( tail ) == TAIL ) {
2474 /* this is the TAIL generated by (?:) */
2475 tail = regnext( tail );
2476 }
2477
3dab1dad 2478
a3621e74 2479 DEBUG_OPTIMISE_r({
32fc9b6a 2480 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2481 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2482 (int)depth * 2 + 2, "",
2483 "Looking for TRIE'able sequences. Tail node is: ",
2484 SvPV_nolen_const( mysv )
a3621e74
YO
2485 );
2486 });
3dab1dad 2487
a3621e74
YO
2488 /*
2489
2490 step through the branches, cur represents each
2491 branch, noper is the first thing to be matched
2492 as part of that branch and noper_next is the
2493 regnext() of that node. if noper is an EXACT
2494 and noper_next is the same as scan (our current
2495 position in the regex) then the EXACT branch is
2496 a possible optimization target. Once we have
2497 two or more consequetive such branches we can
2498 create a trie of the EXACT's contents and stich
2499 it in place. If the sequence represents all of
2500 the branches we eliminate the whole thing and
2501 replace it with a single TRIE. If it is a
2502 subsequence then we need to stitch it in. This
2503 means the first branch has to remain, and needs
2504 to be repointed at the item on the branch chain
2505 following the last branch optimized. This could
2506 be either a BRANCH, in which case the
2507 subsequence is internal, or it could be the
2508 item following the branch sequence in which
2509 case the subsequence is at the end.
2510
2511 */
2512
2513 /* dont use tail as the end marker for this traverse */
2514 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2515 regnode * const noper = NEXTOPER( cur );
be981c67 2516#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2517 regnode * const noper_next = regnext( noper );
be981c67 2518#endif
a3621e74 2519
a3621e74 2520 DEBUG_OPTIMISE_r({
32fc9b6a 2521 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2522 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2523 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2524
32fc9b6a 2525 regprop(RExC_rx, mysv, noper);
a3621e74 2526 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2527 SvPV_nolen_const(mysv));
a3621e74
YO
2528
2529 if ( noper_next ) {
32fc9b6a 2530 regprop(RExC_rx, mysv, noper_next );
a3621e74 2531 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2532 SvPV_nolen_const(mysv));
a3621e74 2533 }
3dab1dad
YO
2534 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2535 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2536 });
3dab1dad
YO
2537 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2538 : PL_regkind[ OP( noper ) ] == EXACT )
2539 || OP(noper) == NOTHING )
786e8c11
YO
2540#ifdef NOJUMPTRIE
2541 && noper_next == tail
2542#endif
2543 && count < U16_MAX)
a3621e74
YO
2544 {
2545 count++;
3dab1dad
YO
2546 if ( !first || optype == NOTHING ) {
2547 if (!first) first = cur;
a3621e74
YO
2548 optype = OP( noper );
2549 } else {
a3621e74 2550 last = cur;
a3621e74
YO
2551 }
2552 } else {
2553 if ( last ) {
786e8c11
YO
2554 make_trie( pRExC_state,
2555 startbranch, first, cur, tail, count,
2556 optype, depth+1 );
a3621e74 2557 }
3dab1dad 2558 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2559#ifdef NOJUMPTRIE
2560 && noper_next == tail
2561#endif
2562 ){
a3621e74
YO
2563 count = 1;
2564 first = cur;
2565 optype = OP( noper );
2566 } else {
2567 count = 0;
2568 first = NULL;
2569 optype = 0;
2570 }
2571 last = NULL;
2572 }
2573 }
2574 DEBUG_OPTIMISE_r({
32fc9b6a 2575 regprop(RExC_rx, mysv, cur);
a3621e74 2576 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2577 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2578 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2579
2580 });
2581 if ( last ) {
786e8c11 2582 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2583#ifdef TRIE_STUDY_OPT
786e8c11
YO
2584 if ( ((made == MADE_EXACT_TRIE &&
2585 startbranch == first)
2586 || ( first_non_open == first )) &&
2587 depth==0 )
2588 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2589#endif
07be1b83 2590 }
a3621e74 2591 }
3dab1dad
YO
2592
2593 } /* do trie */
786e8c11 2594
a0ed51b3 2595 }
a3621e74 2596 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2597 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2598 } else /* single branch is optimized. */
c277df42
IZ
2599 scan = NEXTOPER(scan);
2600 continue;
a0ed51b3
LW
2601 }
2602 else if (OP(scan) == EXACT) {
cd439c50 2603 I32 l = STR_LEN(scan);
c445ea15 2604 UV uc;
a0ed51b3 2605 if (UTF) {
a3b680e6 2606 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2607 l = utf8_length(s, s + l);
9041c2e3 2608 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2609 } else {
2610 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2611 }
2612 min += l;
c277df42 2613 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2614 /* The code below prefers earlier match for fixed
2615 offset, later match for variable offset. */
2616 if (data->last_end == -1) { /* Update the start info. */
2617 data->last_start_min = data->pos_min;
2618 data->last_start_max = is_inf
b81d288d 2619 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2620 }
cd439c50 2621 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2622 if (UTF)
2623 SvUTF8_on(data->last_found);
0eda9292 2624 {
9a957fbc 2625 SV * const sv = data->last_found;
a28509cc 2626 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2627 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2628 if (mg && mg->mg_len >= 0)
5e43f467
JH
2629 mg->mg_len += utf8_length((U8*)STRING(scan),
2630 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2631 }
c277df42
IZ
2632 data->last_end = data->pos_min + l;
2633 data->pos_min += l; /* As in the first entry. */
2634 data->flags &= ~SF_BEFORE_EOL;
2635 }
653099ff
GS
2636 if (flags & SCF_DO_STCLASS_AND) {
2637 /* Check whether it is compatible with what we know already! */
2638 int compat = 1;
2639
1aa99e6b 2640 if (uc >= 0x100 ||
516a5887 2641 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2642 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2643 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2644 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2645 )
653099ff
GS
2646 compat = 0;
2647 ANYOF_CLASS_ZERO(data->start_class);
2648 ANYOF_BITMAP_ZERO(data->start_class);
2649 if (compat)
1aa99e6b 2650 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2651 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2652 if (uc < 0x100)
2653 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2654 }
2655 else if (flags & SCF_DO_STCLASS_OR) {
2656 /* false positive possible if the class is case-folded */
1aa99e6b 2657 if (uc < 0x100)
9b877dbb
IH
2658 ANYOF_BITMAP_SET(data->start_class, uc);
2659 else
2660 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2661 data->start_class->flags &= ~ANYOF_EOS;
2662 cl_and(data->start_class, &and_with);
2663 }
2664 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2665 }
3dab1dad 2666 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2667 I32 l = STR_LEN(scan);
1aa99e6b 2668 UV uc = *((U8*)STRING(scan));
653099ff
GS
2669
2670 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2671 if (flags & SCF_DO_SUBSTR) {
2672 assert(data);
1de06328 2673 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2674 }
a0ed51b3 2675 if (UTF) {
6136c704 2676 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2677 l = utf8_length(s, s + l);
9041c2e3 2678 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2679 }
2680 min += l;
ecaa9b9c 2681 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2682 data->pos_min += l;
653099ff
GS
2683 if (flags & SCF_DO_STCLASS_AND) {
2684 /* Check whether it is compatible with what we know already! */
2685 int compat = 1;
2686
1aa99e6b 2687 if (uc >= 0x100 ||
516a5887 2688 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2689 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2690 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2691 compat = 0;
2692 ANYOF_CLASS_ZERO(data->start_class);
2693 ANYOF_BITMAP_ZERO(data->start_class);
2694 if (compat) {
1aa99e6b 2695 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2696 data->start_class->flags &= ~ANYOF_EOS;
2697 data->start_class->flags |= ANYOF_FOLD;
2698 if (OP(scan) == EXACTFL)
2699 data->start_class->flags |= ANYOF_LOCALE;
2700 }
2701 }
2702 else if (flags & SCF_DO_STCLASS_OR) {
2703 if (data->start_class->flags & ANYOF_FOLD) {
2704 /* false positive possible if the class is case-folded.
2705 Assume that the locale settings are the same... */
1aa99e6b
IH
2706 if (uc < 0x100)
2707 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2708 data->start_class->flags &= ~ANYOF_EOS;
2709 }
2710 cl_and(data->start_class, &and_with);
2711 }
2712 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2713 }
6bda09f9
YO
2714 else if (OP(scan)==RECURSE) {
2715 ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan );
2716 }
bfed75c6 2717 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2718 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2719 I32 f = flags, pos_before = 0;
d4c19fe8 2720 regnode * const oscan = scan;
653099ff
GS
2721 struct regnode_charclass_class this_class;
2722 struct regnode_charclass_class *oclass = NULL;
727f22e3 2723 I32 next_is_eval = 0;
653099ff 2724
3dab1dad 2725 switch (PL_regkind[OP(scan)]) {
653099ff 2726 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2727 scan = NEXTOPER(scan);
2728 goto finish;
2729 case PLUS:
653099ff 2730 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2731 next = NEXTOPER(scan);
653099ff 2732 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2733 mincount = 1;
2734 maxcount = REG_INFTY;
c277df42
IZ
2735 next = regnext(scan);
2736 scan = NEXTOPER(scan);
2737 goto do_curly;
2738 }
2739 }
2740 if (flags & SCF_DO_SUBSTR)
2741 data->pos_min++;
2742 min++;
2743 /* Fall through. */
2744 case STAR:
653099ff
GS
2745 if (flags & SCF_DO_STCLASS) {
2746 mincount = 0;
b81d288d 2747 maxcount = REG_INFTY;
653099ff
GS
2748 next = regnext(scan);
2749 scan = NEXTOPER(scan);
2750 goto do_curly;
2751 }
b81d288d 2752 is_inf = is_inf_internal = 1;
c277df42
IZ
2753 scan = regnext(scan);
2754 if (flags & SCF_DO_SUBSTR) {
1de06328 2755 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2756 data->longest = &(data->longest_float);
2757 }
2758 goto optimize_curly_tail;
2759 case CURLY:
b81d288d 2760 mincount = ARG1(scan);
c277df42
IZ
2761 maxcount = ARG2(scan);
2762 next = regnext(scan);
cb434fcc
IZ
2763 if (OP(scan) == CURLYX) {
2764 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2765 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2766 }
c277df42 2767 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2768 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2769 do_curly:
2770 if (flags & SCF_DO_SUBSTR) {
1de06328 2771 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2772 pos_before = data->pos_min;
2773 }
2774 if (data) {
2775 fl = data->flags;
2776 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2777 if (is_inf)
2778 data->flags |= SF_IS_INF;
2779 }
653099ff 2780 if (flags & SCF_DO_STCLASS) {
830247a4 2781 cl_init(pRExC_state, &this_class);
653099ff
GS
2782 oclass = data->start_class;
2783 data->start_class = &this_class;
2784 f |= SCF_DO_STCLASS_AND;
2785 f &= ~SCF_DO_STCLASS_OR;
2786 }
e1901655
IZ
2787 /* These are the cases when once a subexpression
2788 fails at a particular position, it cannot succeed
2789 even after backtracking at the enclosing scope.
b81d288d 2790
e1901655
IZ
2791 XXXX what if minimal match and we are at the
2792 initial run of {n,m}? */
2793 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2794 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2795
c277df42 2796 /* This will finish on WHILEM, setting scan, or on NULL: */
1de06328 2797 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
a3621e74
YO
2798 (mincount == 0
2799 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2800
2801 if (flags & SCF_DO_STCLASS)
2802 data->start_class = oclass;
2803 if (mincount == 0 || minnext == 0) {
2804 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2805 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2806 }
2807 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2808 /* Switch to OR mode: cache the old value of
653099ff
GS
2809 * data->start_class */
2810 StructCopy(data->start_class, &and_with,
2811 struct regnode_charclass_class);
2812 flags &= ~SCF_DO_STCLASS_AND;
2813 StructCopy(&this_class, data->start_class,
2814 struct regnode_charclass_class);
2815 flags |= SCF_DO_STCLASS_OR;
2816 data->start_class->flags |= ANYOF_EOS;
2817 }
2818 } else { /* Non-zero len */
2819 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2820 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2821 cl_and(data->start_class, &and_with);
2822 }
2823 else if (flags & SCF_DO_STCLASS_AND)
2824 cl_and(data->start_class, &this_class);
2825 flags &= ~SCF_DO_STCLASS;
2826 }
c277df42
IZ
2827 if (!scan) /* It was not CURLYX, but CURLY. */
2828 scan = next;
041457d9
DM
2829 if ( /* ? quantifier ok, except for (?{ ... }) */
2830 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2831 && (minnext == 0) && (deltanext == 0)
99799961 2832 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2833 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2834 && ckWARN(WARN_REGEXP))
b45f050a 2835 {
830247a4 2836 vWARN(RExC_parse,
b45f050a
JF
2837 "Quantifier unexpected on zero-length expression");
2838 }
2839
c277df42 2840 min += minnext * mincount;
b81d288d 2841 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2842 && (minnext + deltanext) > 0)
2843 || deltanext == I32_MAX);
aca2d497 2844 is_inf |= is_inf_internal;
c277df42
IZ
2845 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2846
2847 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2848 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2849 && data->flags & SF_IN_PAR
2850 && !(data->flags & SF_HAS_EVAL)
2851 && !deltanext && minnext == 1 ) {
2852 /* Try to optimize to CURLYN. */
2853 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2854 regnode * const nxt1 = nxt;
497b47a8
JH
2855#ifdef DEBUGGING
2856 regnode *nxt2;
2857#endif
c277df42
IZ
2858
2859 /* Skip open. */
2860 nxt = regnext(nxt);
bfed75c6 2861 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2862 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2863 && STR_LEN(nxt) == 1))
c277df42 2864 goto nogo;
497b47a8 2865#ifdef DEBUGGING
c277df42 2866 nxt2 = nxt;
497b47a8 2867#endif
c277df42 2868 nxt = regnext(nxt);
b81d288d 2869 if (OP(nxt) != CLOSE)
c277df42
IZ
2870 goto nogo;
2871 /* Now we know that nxt2 is the only contents: */
eb160463 2872 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2873 OP(oscan) = CURLYN;
2874 OP(nxt1) = NOTHING; /* was OPEN. */
2875#ifdef DEBUGGING
2876 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2877 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2878 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2879 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2880 OP(nxt + 1) = OPTIMIZED; /* was count. */
2881 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2882#endif
c277df42 2883 }
c277df42
IZ
2884 nogo:
2885
2886 /* Try optimization CURLYX => CURLYM. */
b81d288d 2887 if ( OP(oscan) == CURLYX && data
c277df42 2888 && !(data->flags & SF_HAS_PAR)
c277df42 2889 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2890 && !deltanext /* atom is fixed width */
2891 && minnext != 0 /* CURLYM can't handle zero width */
2892 ) {
c277df42
IZ
2893 /* XXXX How to optimize if data == 0? */
2894 /* Optimize to a simpler form. */
2895 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2896 regnode *nxt2;
2897
2898 OP(oscan) = CURLYM;
2899 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2900 && (OP(nxt2) != WHILEM))
c277df42
IZ
2901 nxt = nxt2;
2902 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2903 /* Need to optimize away parenths. */
2904 if (data->flags & SF_IN_PAR) {
2905 /* Set the parenth number. */
2906 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2907
b81d288d 2908 if (OP(nxt) != CLOSE)
b45f050a 2909 FAIL("Panic opt close");
eb160463 2910 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2911 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2912 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2913#ifdef DEBUGGING
2914 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2915 OP(nxt + 1) = OPTIMIZED; /* was count. */
2916 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2917 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2918#endif
c277df42
IZ
2919#if 0
2920 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2921 regnode *nnxt = regnext(nxt1);
b81d288d 2922
c277df42
IZ
2923 if (nnxt == nxt) {
2924 if (reg_off_by_arg[OP(nxt1)])
2925 ARG_SET(nxt1, nxt2 - nxt1);
2926 else if (nxt2 - nxt1 < U16_MAX)
2927 NEXT_OFF(nxt1) = nxt2 - nxt1;
2928 else
2929 OP(nxt) = NOTHING; /* Cannot beautify */
2930 }
2931 nxt1 = nnxt;
2932 }
2933#endif
2934 /* Optimize again: */
1de06328 2935 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
a3621e74 2936 NULL, 0,depth+1);
a0ed51b3
LW
2937 }
2938 else
c277df42 2939 oscan->flags = 0;
c277df42 2940 }
e1901655
IZ
2941 else if ((OP(oscan) == CURLYX)
2942 && (flags & SCF_WHILEM_VISITED_POS)
2943 /* See the comment on a similar expression above.
2944 However, this time it not a subexpression
2945 we care about, but the expression itself. */
2946 && (maxcount == REG_INFTY)
2947 && data && ++data->whilem_c < 16) {
2948 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2949 /* Find WHILEM (as in regexec.c) */
2950 regnode *nxt = oscan + NEXT_OFF(oscan);
2951
2952 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2953 nxt += ARG(nxt);
eb160463
GS
2954 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2955 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2956 }
b81d288d 2957 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2958 pars++;
2959 if (flags & SCF_DO_SUBSTR) {
c445ea15 2960 SV *last_str = NULL;
c277df42
IZ
2961 int counted = mincount != 0;
2962
2963 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2964#if defined(SPARC64_GCC_WORKAROUND)
2965 I32 b = 0;
2966 STRLEN l = 0;
cfd0369c 2967 const char *s = NULL;
5d1c421c
JH
2968 I32 old = 0;
2969
2970 if (pos_before >= data->last_start_min)
2971 b = pos_before;
2972 else
2973 b = data->last_start_min;
2974
2975 l = 0;
cfd0369c 2976 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2977 old = b - data->last_start_min;
2978
2979#else
b81d288d 2980 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2981 ? pos_before : data->last_start_min;
2982 STRLEN l;
d4c19fe8 2983 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2984 I32 old = b - data->last_start_min;
5d1c421c 2985#endif
a0ed51b3
LW
2986
2987 if (UTF)
2988 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2989
a0ed51b3 2990 l -= old;
c277df42 2991 /* Get the added string: */
79cb57f6 2992 last_str = newSVpvn(s + old, l);
0e933229
IH
2993 if (UTF)
2994 SvUTF8_on(last_str);
c277df42
IZ
2995 if (deltanext == 0 && pos_before == b) {
2996 /* What was added is a constant string */
2997 if (mincount > 1) {
2998 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2999 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3000 SvPVX_const(last_str), l, mincount - 1);
b162af07 3001 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3002 /* Add additional parts. */
b81d288d 3003 SvCUR_set(data->last_found,
c277df42
IZ
3004 SvCUR(data->last_found) - l);
3005 sv_catsv(data->last_found, last_str);
0eda9292
JH
3006 {
3007 SV * sv = data->last_found;
3008 MAGIC *mg =
3009 SvUTF8(sv) && SvMAGICAL(sv) ?
3010 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3011 if (mg && mg->mg_len >= 0)
3012 mg->mg_len += CHR_SVLEN(last_str);
3013 }
c277df42
IZ
3014 data->last_end += l * (mincount - 1);
3015 }
2a8d9689
HS
3016 } else {
3017 /* start offset must point into the last copy */
3018 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3019 data->last_start_max += is_inf ? I32_MAX
3020 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3021 }
3022 }
3023 /* It is counted once already... */
3024 data->pos_min += minnext * (mincount - counted);
3025 data->pos_delta += - counted * deltanext +
3026 (minnext + deltanext) * maxcount - minnext * mincount;
3027 if (mincount != maxcount) {
653099ff
GS
3028 /* Cannot extend fixed substrings found inside
3029 the group. */
1de06328 3030 scan_commit(pRExC_state,data,minlenp);
c277df42 3031 if (mincount && last_str) {
d4c19fe8
AL
3032 SV * const sv = data->last_found;
3033 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3034 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3035
3036 if (mg)
3037 mg->mg_len = -1;
3038 sv_setsv(sv, last_str);
c277df42 3039 data->last_end = data->pos_min;
b81d288d 3040 data->last_start_min =
a0ed51b3 3041 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3042 data->last_start_max = is_inf
3043 ? I32_MAX
c277df42 3044 : data->pos_min + data->pos_delta
a0ed51b3 3045 - CHR_SVLEN(last_str);
c277df42
IZ
3046 }
3047 data->longest = &(data->longest_float);
3048 }
aca2d497 3049 SvREFCNT_dec(last_str);
c277df42 3050 }
405ff068 3051 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3052 data->flags |= SF_HAS_EVAL;
3053 optimize_curly_tail:
c277df42 3054 if (OP(oscan) != CURLYX) {
3dab1dad 3055 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3056 && NEXT_OFF(next))
3057 NEXT_OFF(oscan) += NEXT_OFF(next);
3058 }
c277df42 3059 continue;
653099ff 3060 default: /* REF and CLUMP only? */
c277df42 3061 if (flags & SCF_DO_SUBSTR) {
1de06328 3062 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3063 data->longest = &(data->longest_float);
3064 }
aca2d497 3065 is_inf = is_inf_internal = 1;
653099ff 3066 if (flags & SCF_DO_STCLASS_OR)
830247a4 3067 cl_anything(pRExC_state, data->start_class);
653099ff 3068 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3069 break;
3070 }
a0ed51b3 3071 }
bfed75c6 3072 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3073 int value = 0;
653099ff 3074
c277df42 3075 if (flags & SCF_DO_SUBSTR) {
1de06328 3076 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3077 data->pos_min++;
3078 }
3079 min++;
653099ff
GS
3080 if (flags & SCF_DO_STCLASS) {
3081 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3082
3083 /* Some of the logic below assumes that switching
3084 locale on will only add false positives. */
3dab1dad 3085 switch (PL_regkind[OP(scan)]) {
653099ff 3086 case SANY:
653099ff
GS
3087 default:
3088 do_default:
3089 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3090 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3091 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3092 break;
3093 case REG_ANY:
3094 if (OP(scan) == SANY)
3095 goto do_default;
3096 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3097 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3098 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3099 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3100 }
3101 if (flags & SCF_DO_STCLASS_AND || !value)
3102 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3103 break;
3104 case ANYOF:
3105 if (flags & SCF_DO_STCLASS_AND)
3106 cl_and(data->start_class,
3107 (struct regnode_charclass_class*)scan);
3108 else
830247a4 3109 cl_or(pRExC_state, data->start_class,
653099ff
GS
3110 (struct regnode_charclass_class*)scan);
3111 break;
3112 case ALNUM:
3113 if (flags & SCF_DO_STCLASS_AND) {
3114 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3115 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3116 for (value = 0; value < 256; value++)
3117 if (!isALNUM(value))
3118 ANYOF_BITMAP_CLEAR(data->start_class, value);
3119 }
3120 }
3121 else {
3122 if (data->start_class->flags & ANYOF_LOCALE)
3123 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3124 else {
3125 for (value = 0; value < 256; value++)
3126 if (isALNUM(value))
b81d288d 3127 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3128 }
3129 }
3130 break;
3131 case ALNUML:
3132 if (flags & SCF_DO_STCLASS_AND) {
3133 if (data->start_class->flags & ANYOF_LOCALE)
3134 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3135 }
3136 else {
3137 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3138 data->start_class->flags |= ANYOF_LOCALE;
3139 }
3140 break;
3141 case NALNUM:
3142 if (flags & SCF_DO_STCLASS_AND) {
3143 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3144 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3145 for (value = 0; value < 256; value++)
3146 if (isALNUM(value))
3147 ANYOF_BITMAP_CLEAR(data->start_class, value);
3148 }
3149 }
3150 else {
3151 if (data->start_class->flags & ANYOF_LOCALE)
3152 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3153 else {
3154 for (value = 0; value < 256; value++)
3155 if (!isALNUM(value))
b81d288d 3156 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3157 }
3158 }
3159 break;
3160 case NALNUML:
3161 if (flags & SCF_DO_STCLASS_AND) {
3162 if (data->start_class->flags & ANYOF_LOCALE)
3163 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3164 }
3165 else {
3166 data->start_class->flags |= ANYOF_LOCALE;
3167 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3168 }
3169 break;
3170 case SPACE:
3171 if (flags & SCF_DO_STCLASS_AND) {
3172 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3173 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3174 for (value = 0; value < 256; value++)
3175 if (!isSPACE(value))
3176 ANYOF_BITMAP_CLEAR(data->start_class, value);
3177 }
3178 }
3179 else {
3180 if (data->start_class->flags & ANYOF_LOCALE)
3181 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3182 else {
3183 for (value = 0; value < 256; value++)
3184 if (isSPACE(value))
b81d288d 3185 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3186 }
3187 }
3188 break;
3189 case SPACEL:
3190 if (flags & SCF_DO_STCLASS_AND) {
3191 if (data->start_class->flags & ANYOF_LOCALE)
3192 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3193 }
3194 else {
3195 data->start_class->flags |= ANYOF_LOCALE;
3196 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3197 }
3198 break;
3199 case NSPACE:
3200 if (flags & SCF_DO_STCLASS_AND) {
3201 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3202 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3203 for (value = 0; value < 256; value++)
3204 if (isSPACE(value))
3205 ANYOF_BITMAP_CLEAR(data->start_class, value);
3206 }
3207 }
3208 else {
3209 if (data->start_class->flags & ANYOF_LOCALE)
3210 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3211 else {
3212 for (value = 0; value < 256; value++)
3213 if (!isSPACE(value))
b81d288d 3214 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3215 }
3216 }
3217 break;
3218 case NSPACEL:
3219 if (flags & SCF_DO_STCLASS_AND) {
3220 if (data->start_class->flags & ANYOF_LOCALE) {
3221 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3222 for (value = 0; value < 256; value++)
3223 if (!isSPACE(value))
3224 ANYOF_BITMAP_CLEAR(data->start_class, value);
3225 }
3226 }
3227 else {
3228 data->start_class->flags |= ANYOF_LOCALE;
3229 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3230 }
3231 break;
3232 case DIGIT:
3233 if (flags & SCF_DO_STCLASS_AND) {
3234 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3235 for (value = 0; value < 256; value++)
3236 if (!isDIGIT(value))
3237 ANYOF_BITMAP_CLEAR(data->start_class, value);
3238 }
3239 else {
3240 if (data->start_class->flags & ANYOF_LOCALE)
3241 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3242 else {
3243 for (value = 0; value < 256; value++)
3244 if (isDIGIT(value))
b81d288d 3245 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3246 }
3247 }
3248 break;
3249 case NDIGIT:
3250 if (flags & SCF_DO_STCLASS_AND) {
3251 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3252 for (value = 0; value < 256; value++)
3253 if (isDIGIT(value))
3254 ANYOF_BITMAP_CLEAR(data->start_class, value);
3255 }
3256 else {
3257 if (data->start_class->flags & ANYOF_LOCALE)
3258 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3259 else {
3260 for (value = 0; value < 256; value++)
3261 if (!isDIGIT(value))
b81d288d 3262 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3263 }
3264 }
3265 break;
3266 }
3267 if (flags & SCF_DO_STCLASS_OR)
3268 cl_and(data->start_class, &and_with);
3269 flags &= ~SCF_DO_STCLASS;
3270 }
a0ed51b3 3271 }
3dab1dad 3272 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3273 data->flags |= (OP(scan) == MEOL
3274 ? SF_BEFORE_MEOL
3275 : SF_BEFORE_SEOL);
a0ed51b3 3276 }
3dab1dad 3277 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3278 /* Lookbehind, or need to calculate parens/evals/stclass: */
3279 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3280 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3281 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3282 || OP(scan) == UNLESSM )
3283 {
3284 /* Negative Lookahead/lookbehind
3285 In this case we can't do fixed string optimisation.
3286 */
3287
3288 I32 deltanext, minnext, fake = 0;
3289 regnode *nscan;
3290 struct regnode_charclass_class intrnl;
3291 int f = 0;
3292
3293 data_fake.flags = 0;
3294 if (data) {
3295 data_fake.whilem_c = data->whilem_c;
3296 data_fake.last_closep = data->last_closep;
a0ed51b3 3297 }
1de06328
YO
3298 else
3299 data_fake.last_closep = &fake;
3300 if ( flags & SCF_DO_STCLASS && !scan->flags
3301 && OP(scan) == IFMATCH ) { /* Lookahead */
3302 cl_init(pRExC_state, &intrnl);
3303 data_fake.start_class = &intrnl;
3304 f |= SCF_DO_STCLASS_AND;
c277df42 3305 }
1de06328
YO
3306 if (flags & SCF_WHILEM_VISITED_POS)
3307 f |= SCF_WHILEM_VISITED_POS;
3308 next = regnext(scan);
3309 nscan = NEXTOPER(NEXTOPER(scan));
3310 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3311 if (scan->flags) {
3312 if (deltanext) {
3313 vFAIL("Variable length lookbehind not implemented");
3314 }
3315 else if (minnext > (I32)U8_MAX) {
3316 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3317 }
3318 scan->flags = (U8)minnext;
3319 }
3320 if (data) {
3321 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3322 pars++;
3323 if (data_fake.flags & SF_HAS_EVAL)
3324 data->flags |= SF_HAS_EVAL;
3325 data->whilem_c = data_fake.whilem_c;
3326 }
3327 if (f & SCF_DO_STCLASS_AND) {
3328 const int was = (data->start_class->flags & ANYOF_EOS);
3329
3330 cl_and(data->start_class, &intrnl);
3331 if (was)
3332 data->start_class->flags |= ANYOF_EOS;
3333 }
be8e71aa 3334 }
1de06328
YO
3335#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3336 else {
3337 /* Positive Lookahead/lookbehind
3338 In this case we can do fixed string optimisation,
3339 but we must be careful about it. Note in the case of
3340 lookbehind the positions will be offset by the minimum
3341 length of the pattern, something we won't know about
3342 until after the recurse.
3343 */
3344 I32 deltanext, fake = 0;
3345 regnode *nscan;
3346 struct regnode_charclass_class intrnl;
3347 int f = 0;
3348 /* We use SAVEFREEPV so that when the full compile
3349 is finished perl will clean up the allocated
3350 minlens when its all done. This was we don't
3351 have to worry about freeing them when we know
3352 they wont be used, which would be a pain.
3353 */
3354 I32 *minnextp;
3355 Newx( minnextp, 1, I32 );
3356 SAVEFREEPV(minnextp);
3357
3358 if (data) {
3359 StructCopy(data, &data_fake, scan_data_t);
3360 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3361 f |= SCF_DO_SUBSTR;
3362 if (scan->flags)
3363 scan_commit(pRExC_state, &data_fake,minlenp);
3364 data_fake.last_found=newSVsv(data->last_found);
3365 }
3366 }
3367 else
3368 data_fake.last_closep = &fake;
3369 data_fake.flags = 0;
3370 if (is_inf)
3371 data_fake.flags |= SF_IS_INF;
3372 if ( flags & SCF_DO_STCLASS && !scan->flags
3373 && OP(scan) == IFMATCH ) { /* Lookahead */
3374 cl_init(pRExC_state, &intrnl);
3375 data_fake.start_class = &intrnl;
3376 f |= SCF_DO_STCLASS_AND;
3377 }
3378 if (flags & SCF_WHILEM_VISITED_POS)
3379 f |= SCF_WHILEM_VISITED_POS;
3380 next = regnext(scan);
3381 nscan = NEXTOPER(NEXTOPER(scan));
3382
3383 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3384 if (scan->flags) {
3385 if (deltanext) {
3386 vFAIL("Variable length lookbehind not implemented");
3387 }
3388 else if (*minnextp > (I32)U8_MAX) {
3389 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3390 }
3391 scan->flags = (U8)*minnextp;
3392 }
3393
3394 *minnextp += min;
3395
3396
3397 if (f & SCF_DO_STCLASS_AND) {
3398 const int was = (data->start_class->flags & ANYOF_EOS);
3399
3400 cl_and(data->start_class, &intrnl);
3401 if (was)
3402 data->start_class->flags |= ANYOF_EOS;
3403 }
3404 if (data) {
3405 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3406 pars++;
3407 if (data_fake.flags & SF_HAS_EVAL)
3408 data->flags |= SF_HAS_EVAL;
3409 data->whilem_c = data_fake.whilem_c;
3410 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3411 if (RExC_rx->minlen<*minnextp)
3412 RExC_rx->minlen=*minnextp;
3413 scan_commit(pRExC_state, &data_fake, minnextp);
3414 SvREFCNT_dec(data_fake.last_found);
3415
3416 if ( data_fake.minlen_fixed != minlenp )
3417 {
3418 data->offset_fixed= data_fake.offset_fixed;
3419 data->minlen_fixed= data_fake.minlen_fixed;
3420 data->lookbehind_fixed+= scan->flags;
3421 }
3422 if ( data_fake.minlen_float != minlenp )
3423 {
3424 data->minlen_float= data_fake.minlen_float;
3425 data->offset_float_min=data_fake.offset_float_min;
3426 data->offset_float_max=data_fake.offset_float_max;
3427 data->lookbehind_float+= scan->flags;
3428 }
3429 }
3430 }
3431
653099ff 3432
653099ff 3433 }
1de06328 3434#endif
a0ed51b3
LW
3435 }
3436 else if (OP(scan) == OPEN) {
c277df42 3437 pars++;
a0ed51b3 3438 }
cb434fcc 3439 else if (OP(scan) == CLOSE) {
eb160463 3440 if ((I32)ARG(scan) == is_par) {
cb434fcc 3441 next = regnext(scan);
c277df42 3442
cb434fcc
IZ
3443 if ( next && (OP(next) != WHILEM) && next < last)
3444 is_par = 0; /* Disable optimization */
3445 }
3446 if (data)
3447 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3448 }
3449 else if (OP(scan) == EVAL) {
c277df42
IZ
3450 if (data)
3451 data->flags |= SF_HAS_EVAL;
3452 }
96776eda 3453 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3454 if (flags & SCF_DO_SUBSTR) {
1de06328 3455 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3456 data->longest = &(data->longest_float);
3457 }
3458 is_inf = is_inf_internal = 1;
653099ff 3459 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3460 cl_anything(pRExC_state, data->start_class);
96776eda 3461 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3462 }
786e8c11
YO
3463#ifdef TRIE_STUDY_OPT
3464#ifdef FULL_TRIE_STUDY
3465 else if (PL_regkind[OP(scan)] == TRIE) {
3466 /* NOTE - There is similar code to this block above for handling
3467 BRANCH nodes on the initial study. If you change stuff here
3468 check there too. */
3469 regnode *tail= regnext(scan);
3470 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3471 I32 max1 = 0, min1 = I32_MAX;
3472 struct regnode_charclass_class accum;
3473
3474 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3475 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3476 if (flags & SCF_DO_STCLASS)
3477 cl_init_zero(pRExC_state, &accum);
3478
3479 if (!trie->jump) {
3480 min1= trie->minlen;
3481 max1= trie->maxlen;
3482 } else {
3483 const regnode *nextbranch= NULL;
3484 U32 word;
3485
3486 for ( word=1 ; word <= trie->wordcount ; word++)
3487 {
3488 I32 deltanext=0, minnext=0, f = 0, fake;
3489 struct regnode_charclass_class this_class;
3490
3491 data_fake.flags = 0;
3492 if (data) {
3493 data_fake.whilem_c = data->whilem_c;
3494 data_fake.last_closep = data->last_closep;
3495 }
3496 else
3497 data_fake.last_closep = &fake;
3498
3499 if (flags & SCF_DO_STCLASS) {
3500 cl_init(pRExC_state, &this_class);
3501 data_fake.start_class = &this_class;
3502 f = SCF_DO_STCLASS_AND;
3503 }
3504 if (flags & SCF_WHILEM_VISITED_POS)
3505 f |= SCF_WHILEM_VISITED_POS;
3506
3507 if (trie->jump[word]) {
3508 if (!nextbranch)
3509 nextbranch = tail - trie->jump[0];
3510 scan= tail - trie->jump[word];
3511 /* We go from the jump point to the branch that follows
3512 it. Note this means we need the vestigal unused branches
3513 even though they arent otherwise used.
3514 */
1de06328 3515 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
786e8c11
YO
3516 (regnode *)nextbranch, &data_fake, f,depth+1);
3517 }
3518 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3519 nextbranch= regnext((regnode*)nextbranch);
3520
3521 if (min1 > (I32)(minnext + trie->minlen))
3522 min1 = minnext + trie->minlen;
3523 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3524 max1 = minnext + deltanext + trie->maxlen;
3525 if (deltanext == I32_MAX)
3526 is_inf = is_inf_internal = 1;
3527
3528 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3529 pars++;
3530
3531 if (data) {
3532 if (data_fake.flags & SF_HAS_EVAL)
3533 data->flags |= SF_HAS_EVAL;
3534 data->whilem_c = data_fake.whilem_c;
3535 }
3536 if (flags & SCF_DO_STCLASS)
3537 cl_or(pRExC_state, &accum, &this_class);
3538 }
3539 }
3540 if (flags & SCF_DO_SUBSTR) {
3541 data->pos_min += min1;
3542 data->pos_delta += max1 - min1;
3543 if (max1 != min1 || is_inf)
3544 data->longest = &(data->longest_float);
3545 }
3546 min += min1;
3547 delta += max1 - min1;
3548 if (flags & SCF_DO_STCLASS_OR) {
3549 cl_or(pRExC_state, data->start_class, &accum);
3550 if (min1) {
3551 cl_and(data->start_class, &and_with);
3552 flags &= ~SCF_DO_STCLASS;
3553 }
3554 }
3555 else if (flags & SCF_DO_STCLASS_AND) {
3556 if (min1) {
3557 cl_and(data->start_class, &accum);
3558 flags &= ~SCF_DO_STCLASS;
3559 }
3560 else {
3561 /* Switch to OR mode: cache the old value of
3562 * data->start_class */
3563 StructCopy(data->start_class, &and_with,
3564 struct regnode_charclass_class);
3565 flags &= ~SCF_DO_STCLASS_AND;
3566 StructCopy(&accum, data->start_class,
3567 struct regnode_charclass_class);
3568 flags |= SCF_DO_STCLASS_OR;
3569 data->start_class->flags |= ANYOF_EOS;
3570 }
3571 }
3572 scan= tail;
3573 continue;
3574 }
3575#else
3576 else if (PL_regkind[OP(scan)] == TRIE) {
3577 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3578 U8*bang=NULL;
3579
3580 min += trie->minlen;
3581 delta += (trie->maxlen - trie->minlen);
3582 flags &= ~SCF_DO_STCLASS; /* xxx */
3583 if (flags & SCF_DO_SUBSTR) {
1de06328 3584 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3585 data->pos_min += trie->minlen;
3586 data->pos_delta += (trie->maxlen - trie->minlen);
3587 if (trie->maxlen != trie->minlen)
3588 data->longest = &(data->longest_float);
3589 }
3590 if (trie->jump) /* no more substrings -- for now /grr*/
3591 flags &= ~SCF_DO_SUBSTR;
3592 }
3593#endif /* old or new */
3594#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3595 /* Else: zero-length, ignore. */
3596 scan = regnext(scan);
3597 }
3598
3599 finish:
3600 *scanp = scan;
aca2d497 3601 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3602 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3603 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3604 if (is_par > (I32)U8_MAX)
c277df42
IZ
3605 is_par = 0;
3606 if (is_par && pars==1 && data) {
3607 data->flags |= SF_IN_PAR;
3608 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3609 }
3610 else if (pars && data) {
c277df42
IZ
3611 data->flags |= SF_HAS_PAR;
3612 data->flags &= ~SF_IN_PAR;
3613 }
653099ff
GS
3614 if (flags & SCF_DO_STCLASS_OR)
3615 cl_and(data->start_class, &and_with);
786e8c11
YO
3616 if (flags & SCF_TRIE_RESTUDY)
3617 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3618
3619 DEBUG_STUDYDATA(data,depth);
3620
c277df42
IZ
3621 return min;
3622}
3623
76e3520e 3624STATIC I32
5f66b61c 3625S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3626{
830247a4 3627 if (RExC_rx->data) {
b81d288d
AB
3628 Renewc(RExC_rx->data,
3629 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3630 char, struct reg_data);
830247a4
IZ
3631 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3632 RExC_rx->data->count += n;
a0ed51b3
LW
3633 }
3634 else {
a02a5408 3635 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3636 char, struct reg_data);
a02a5408 3637 Newx(RExC_rx->data->what, n, U8);
830247a4 3638 RExC_rx->data->count = n;
c277df42 3639 }
830247a4
IZ
3640 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3641 return RExC_rx->data->count - n;
c277df42
IZ
3642}
3643
76234dfb 3644#ifndef PERL_IN_XSUB_RE
d88dccdf 3645void
864dbfa3 3646Perl_reginitcolors(pTHX)
d88dccdf 3647{
97aff369 3648 dVAR;
1df70142 3649 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3650 if (s) {
1df70142
AL
3651 char *t = savepv(s);
3652 int i = 0;
3653 PL_colors[0] = t;
d88dccdf 3654 while (++i < 6) {
1df70142
AL
3655 t = strchr(t, '\t');
3656 if (t) {
3657 *t = '\0';
3658 PL_colors[i] = ++t;
d88dccdf
IZ
3659 }
3660 else
1df70142 3661 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3662 }
3663 } else {
1df70142 3664 int i = 0;
b81d288d 3665 while (i < 6)
06b5626a 3666 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3667 }
3668 PL_colorset = 1;
3669}
76234dfb 3670#endif
8615cb43 3671
07be1b83 3672
786e8c11
YO
3673#ifdef TRIE_STUDY_OPT
3674#define CHECK_RESTUDY_GOTO \
3675 if ( \
3676 (data.flags & SCF_TRIE_RESTUDY) \
3677 && ! restudied++ \
3678 ) goto reStudy
3679#else
3680#define CHECK_RESTUDY_GOTO
3681#endif
f9f4320a 3682
a687059c 3683/*
e50aee73 3684 - pregcomp - compile a regular expression into internal code
a687059c
LW
3685 *
3686 * We can't allocate space until we know how big the compiled form will be,
3687 * but we can't compile it (and thus know how big it is) until we've got a
3688 * place to put the code. So we cheat: we compile it twice, once with code
3689 * generation turned off and size counting turned on, and once "for real".
3690 * This also means that we don't allocate space until we are sure that the
3691 * thing really will compile successfully, and we never have to move the
3692 * code and thus invalidate pointers into it. (Note that it has to be in
3693 * one piece because free() must be able to free it all.) [NB: not true in perl]
3694 *
3695 * Beware that the optimization-preparation code in here knows about some
3696 * of the structure of the compiled regexp. [I'll say.]
3697 */
f9f4320a
YO
3698#ifndef PERL_IN_XSUB_RE
3699#define CORE_ONLY_BLOCK(c) {c}{
3700#define RE_ENGINE_PTR &PL_core_reg_engine
3701#else
3702#define CORE_ONLY_BLOCK(c) {
3703extern const struct regexp_engine my_reg_engine;
3704#define RE_ENGINE_PTR &my_reg_engine
3705#endif
3706#define END_BLOCK }
3707
a687059c 3708regexp *
864dbfa3 3709Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3710{
97aff369 3711 dVAR;
f9f4320a
YO
3712 GET_RE_DEBUG_FLAGS_DECL;
3713 DEBUG_r(if (!PL_colorset) reginitcolors());
3714 CORE_ONLY_BLOCK(
3715 /* Dispatch a request to compile a regexp to correct
3716 regexp engine. */
3717 HV * const table = GvHV(PL_hintgv);
3718 if (table) {
3719 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3720 if (ptr && SvIOK(*ptr)) {
3721 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3722 DEBUG_COMPILE_r({
8d8756e7 3723 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3724 SvIV(*ptr));
3725 });
f2f78491 3726 return CALLREGCOMP_ENG(en