This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Mention that field hashes are for inside-out objects
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
6bda09f9 120 regnode **parens; /* offsets of each paren */
1aa99e6b 121 I32 utf8;
6bda09f9 122 HV *charnames; /* cache of named sequences */
81714fb9 123 HV *paren_names; /* Paren names */
830247a4
IZ
124#if ADD_TO_REGEXEC
125 char *starttry; /* -Dr: where regtry was called. */
126#define RExC_starttry (pRExC_state->starttry)
127#endif
3dab1dad 128#ifdef DEBUGGING
be8e71aa 129 const char *lastparse;
3dab1dad
YO
130 I32 lastnum;
131#define RExC_lastparse (pRExC_state->lastparse)
132#define RExC_lastnum (pRExC_state->lastnum)
133#endif
830247a4
IZ
134} RExC_state_t;
135
e2509266 136#define RExC_flags (pRExC_state->flags)
830247a4
IZ
137#define RExC_precomp (pRExC_state->precomp)
138#define RExC_rx (pRExC_state->rx)
fac92740 139#define RExC_start (pRExC_state->start)
830247a4
IZ
140#define RExC_end (pRExC_state->end)
141#define RExC_parse (pRExC_state->parse)
142#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 143#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 144#define RExC_emit (pRExC_state->emit)
fac92740 145#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
146#define RExC_naughty (pRExC_state->naughty)
147#define RExC_sawback (pRExC_state->sawback)
148#define RExC_seen (pRExC_state->seen)
149#define RExC_size (pRExC_state->size)
150#define RExC_npar (pRExC_state->npar)
151#define RExC_extralen (pRExC_state->extralen)
152#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
153#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 154#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 155#define RExC_charnames (pRExC_state->charnames)
6bda09f9 156#define RExC_parens (pRExC_state->parens)
81714fb9 157#define RExC_paren_names (pRExC_state->paren_names)
830247a4 158
a687059c
LW
159#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
160#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
161 ((*s) == '{' && regcurly(s)))
a687059c 162
35c8bce7
LW
163#ifdef SPSTART
164#undef SPSTART /* dratted cpp namespace... */
165#endif
a687059c
LW
166/*
167 * Flags to be passed up and down.
168 */
a687059c 169#define WORST 0 /* Worst case. */
821b33a5 170#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
171#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
172#define SPSTART 0x4 /* Starts with * or +. */
173#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 174
3dab1dad
YO
175#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
176
07be1b83
YO
177/* whether trie related optimizations are enabled */
178#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
179#define TRIE_STUDY_OPT
786e8c11 180#define FULL_TRIE_STUDY
07be1b83
YO
181#define TRIE_STCLASS
182#endif
1de06328
YO
183
184
185/* About scan_data_t.
186
187 During optimisation we recurse through the regexp program performing
188 various inplace (keyhole style) optimisations. In addition study_chunk
189 and scan_commit populate this data structure with information about
190 what strings MUST appear in the pattern. We look for the longest
191 string that must appear for at a fixed location, and we look for the
192 longest string that may appear at a floating location. So for instance
193 in the pattern:
194
195 /FOO[xX]A.*B[xX]BAR/
196
197 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
198 strings (because they follow a .* construct). study_chunk will identify
199 both FOO and BAR as being the longest fixed and floating strings respectively.
200
201 The strings can be composites, for instance
202
203 /(f)(o)(o)/
204
205 will result in a composite fixed substring 'foo'.
206
207 For each string some basic information is maintained:
208
209 - offset or min_offset
210 This is the position the string must appear at, or not before.
211 It also implicitly (when combined with minlenp) tells us how many
212 character must match before the string we are searching.
213 Likewise when combined with minlenp and the length of the string
214 tells us how many characters must appear after the string we have
215 found.
216
217 - max_offset
218 Only used for floating strings. This is the rightmost point that
219 the string can appear at. Ifset to I32 max it indicates that the
220 string can occur infinitely far to the right.
221
222 - minlenp
223 A pointer to the minimum length of the pattern that the string
224 was found inside. This is important as in the case of positive
225 lookahead or positive lookbehind we can have multiple patterns
226 involved. Consider
227
228 /(?=FOO).*F/
229
230 The minimum length of the pattern overall is 3, the minimum length
231 of the lookahead part is 3, but the minimum length of the part that
232 will actually match is 1. So 'FOO's minimum length is 3, but the
233 minimum length for the F is 1. This is important as the minimum length
234 is used to determine offsets in front of and behind the string being
235 looked for. Since strings can be composites this is the length of the
236 pattern at the time it was commited with a scan_commit. Note that
237 the length is calculated by study_chunk, so that the minimum lengths
238 are not known until the full pattern has been compiled, thus the
239 pointer to the value.
240
241 - lookbehind
242
243 In the case of lookbehind the string being searched for can be
244 offset past the start point of the final matching string.
245 If this value was just blithely removed from the min_offset it would
246 invalidate some of the calculations for how many chars must match
247 before or after (as they are derived from min_offset and minlen and
248 the length of the string being searched for).
249 When the final pattern is compiled and the data is moved from the
250 scan_data_t structure into the regexp structure the information
251 about lookbehind is factored in, with the information that would
252 have been lost precalculated in the end_shift field for the
253 associated string.
254
255 The fields pos_min and pos_delta are used to store the minimum offset
256 and the delta to the maximum offset at the current point in the pattern.
257
258*/
2c2d71f5
JH
259
260typedef struct scan_data_t {
1de06328
YO
261 /*I32 len_min; unused */
262 /*I32 len_delta; unused */
2c2d71f5
JH
263 I32 pos_min;
264 I32 pos_delta;
265 SV *last_found;
1de06328 266 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
267 I32 last_start_min;
268 I32 last_start_max;
1de06328
YO
269 SV **longest; /* Either &l_fixed, or &l_float. */
270 SV *longest_fixed; /* longest fixed string found in pattern */
271 I32 offset_fixed; /* offset where it starts */
272 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
273 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
274 SV *longest_float; /* longest floating string found in pattern */
275 I32 offset_float_min; /* earliest point in string it can appear */
276 I32 offset_float_max; /* latest point in string it can appear */
277 I32 *minlen_float; /* pointer to the minlen relevent to the string */
278 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
279 I32 flags;
280 I32 whilem_c;
cb434fcc 281 I32 *last_closep;
653099ff 282 struct regnode_charclass_class *start_class;
2c2d71f5
JH
283} scan_data_t;
284
a687059c 285/*
e50aee73 286 * Forward declarations for pregcomp()'s friends.
a687059c 287 */
a0d0e21e 288
27da23d5 289static const scan_data_t zero_scan_data =
1de06328 290 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
291
292#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
293#define SF_BEFORE_SEOL 0x0001
294#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
295#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
296#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
297
09b7f37c
CB
298#ifdef NO_UNARY_PLUS
299# define SF_FIX_SHIFT_EOL (0+2)
300# define SF_FL_SHIFT_EOL (0+4)
301#else
302# define SF_FIX_SHIFT_EOL (+2)
303# define SF_FL_SHIFT_EOL (+4)
304#endif
c277df42
IZ
305
306#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
307#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
308
309#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
310#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
311#define SF_IS_INF 0x0040
312#define SF_HAS_PAR 0x0080
313#define SF_IN_PAR 0x0100
314#define SF_HAS_EVAL 0x0200
315#define SCF_DO_SUBSTR 0x0400
653099ff
GS
316#define SCF_DO_STCLASS_AND 0x0800
317#define SCF_DO_STCLASS_OR 0x1000
318#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 319#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 320
786e8c11
YO
321#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
322
07be1b83 323
eb160463 324#define UTF (RExC_utf8 != 0)
e2509266
JH
325#define LOC ((RExC_flags & PMf_LOCALE) != 0)
326#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 327
ffc61ed2 328#define OOB_UNICODE 12345678
93733859 329#define OOB_NAMEDCLASS -1
b8c5462f 330
a0ed51b3
LW
331#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
332#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
333
8615cb43 334
b45f050a
JF
335/* length of regex to show in messages that don't mark a position within */
336#define RegexLengthToShowInErrorMessages 127
337
338/*
339 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
340 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
341 * op/pragma/warn/regcomp.
342 */
7253e4e3
RK
343#define MARKER1 "<-- HERE" /* marker as it appears in the description */
344#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 345
7253e4e3 346#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
347
348/*
349 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
350 * arg. Show regex, up to a maximum length. If it's too long, chop and add
351 * "...".
352 */
ccb2c380 353#define FAIL(msg) STMT_START { \
bfed75c6 354 const char *ellipses = ""; \
ccb2c380
MP
355 IV len = RExC_end - RExC_precomp; \
356 \
357 if (!SIZE_ONLY) \
358 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
359 if (len > RegexLengthToShowInErrorMessages) { \
360 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
361 len = RegexLengthToShowInErrorMessages - 10; \
362 ellipses = "..."; \
363 } \
364 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
365 msg, (int)len, RExC_precomp, ellipses); \
366} STMT_END
8615cb43 367
b45f050a 368/*
b45f050a
JF
369 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
370 */
ccb2c380 371#define Simple_vFAIL(m) STMT_START { \
a28509cc 372 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
373 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
374 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375} STMT_END
b45f050a
JF
376
377/*
378 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
379 */
ccb2c380
MP
380#define vFAIL(m) STMT_START { \
381 if (!SIZE_ONLY) \
382 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
383 Simple_vFAIL(m); \
384} STMT_END
b45f050a
JF
385
386/*
387 * Like Simple_vFAIL(), but accepts two arguments.
388 */
ccb2c380 389#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 390 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
391 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
392 (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
b45f050a
JF
394
395/*
396 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
397 */
ccb2c380
MP
398#define vFAIL2(m,a1) STMT_START { \
399 if (!SIZE_ONLY) \
400 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
401 Simple_vFAIL2(m, a1); \
402} STMT_END
b45f050a
JF
403
404
405/*
406 * Like Simple_vFAIL(), but accepts three arguments.
407 */
ccb2c380 408#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 409 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
410 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
411 (int)offset, RExC_precomp, RExC_precomp + offset); \
412} STMT_END
b45f050a
JF
413
414/*
415 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
416 */
ccb2c380
MP
417#define vFAIL3(m,a1,a2) STMT_START { \
418 if (!SIZE_ONLY) \
419 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
420 Simple_vFAIL3(m, a1, a2); \
421} STMT_END
b45f050a
JF
422
423/*
424 * Like Simple_vFAIL(), but accepts four arguments.
425 */
ccb2c380 426#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
429 (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a 431
ccb2c380 432#define vWARN(loc,m) STMT_START { \
a28509cc 433 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
434 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
435 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
436} STMT_END
437
438#define vWARNdep(loc,m) STMT_START { \
a28509cc 439 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
440 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
441 "%s" REPORT_LOCATION, \
442 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
443} STMT_END
444
445
446#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 447 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
448 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
449 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
451
452#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
455 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
461 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
463
464#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 465 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
467 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
9d1d55b5 469
8615cb43 470
cd439c50 471/* Allow for side effects in s */
ccb2c380
MP
472#define REGC(c,s) STMT_START { \
473 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
474} STMT_END
cd439c50 475
fac92740
MJD
476/* Macros for recording node offsets. 20001227 mjd@plover.com
477 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
478 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
479 * Element 0 holds the number n.
07be1b83 480 * Position is 1 indexed.
fac92740
MJD
481 */
482
ccb2c380
MP
483#define Set_Node_Offset_To_R(node,byte) STMT_START { \
484 if (! SIZE_ONLY) { \
485 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 486 __LINE__, (node), (int)(byte))); \
ccb2c380 487 if((node) < 0) { \
551405c4 488 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
489 } else { \
490 RExC_offsets[2*(node)-1] = (byte); \
491 } \
492 } \
493} STMT_END
494
495#define Set_Node_Offset(node,byte) \
496 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
497#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
498
499#define Set_Node_Length_To_R(node,len) STMT_START { \
500 if (! SIZE_ONLY) { \
501 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 502 __LINE__, (int)(node), (int)(len))); \
ccb2c380 503 if((node) < 0) { \
551405c4 504 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
505 } else { \
506 RExC_offsets[2*(node)] = (len); \
507 } \
508 } \
509} STMT_END
510
511#define Set_Node_Length(node,len) \
512 Set_Node_Length_To_R((node)-RExC_emit_start, len)
513#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
514#define Set_Node_Cur_Length(node) \
515 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
516
517/* Get offsets and lengths */
518#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
519#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
520
07be1b83
YO
521#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
522 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
523 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
524} STMT_END
525
526
527#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
528#define EXPERIMENTAL_INPLACESCAN
529#endif
530
1de06328 531#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 532DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
533 PerlIO_printf(Perl_debug_log, \
534 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
535 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
536 (int)(depth)*2, "", \
537 (IV)((data)->pos_min), \
538 (IV)((data)->pos_delta), \
539 (IV)((data)->flags), \
540 (IV)((data)->whilem_c), \
541 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
542 ); \
543 if ((data)->last_found) \
544 PerlIO_printf(Perl_debug_log, \
545 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
546 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
547 SvPVX_const((data)->last_found), \
548 (IV)((data)->last_end), \
549 (IV)((data)->last_start_min), \
550 (IV)((data)->last_start_max), \
551 ((data)->longest && \
552 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
553 SvPVX_const((data)->longest_fixed), \
554 (IV)((data)->offset_fixed), \
555 ((data)->longest && \
556 (data)->longest==&((data)->longest_float)) ? "*" : "", \
557 SvPVX_const((data)->longest_float), \
558 (IV)((data)->offset_float_min), \
559 (IV)((data)->offset_float_max) \
560 ); \
561 PerlIO_printf(Perl_debug_log,"\n"); \
562});
563
acfe0abc 564static void clear_re(pTHX_ void *r);
4327152a 565
653099ff 566/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 567 Update the longest found anchored substring and the longest found
653099ff
GS
568 floating substrings if needed. */
569
4327152a 570STATIC void
1de06328 571S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 572{
e1ec3a88
AL
573 const STRLEN l = CHR_SVLEN(data->last_found);
574 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 575 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 576
c277df42 577 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 578 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
579 if (*data->longest == data->longest_fixed) {
580 data->offset_fixed = l ? data->last_start_min : data->pos_min;
581 if (data->flags & SF_BEFORE_EOL)
b81d288d 582 data->flags
c277df42
IZ
583 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
584 else
585 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
586 data->minlen_fixed=minlenp;
587 data->lookbehind_fixed=0;
a0ed51b3
LW
588 }
589 else {
c277df42 590 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
591 data->offset_float_max = (l
592 ? data->last_start_max
c277df42 593 : data->pos_min + data->pos_delta);
9051bda5
HS
594 if ((U32)data->offset_float_max > (U32)I32_MAX)
595 data->offset_float_max = I32_MAX;
c277df42 596 if (data->flags & SF_BEFORE_EOL)
b81d288d 597 data->flags
c277df42
IZ
598 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
599 else
600 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
601 data->minlen_float=minlenp;
602 data->lookbehind_float=0;
c277df42
IZ
603 }
604 }
605 SvCUR_set(data->last_found, 0);
0eda9292 606 {
a28509cc 607 SV * const sv = data->last_found;
097eb12c
AL
608 if (SvUTF8(sv) && SvMAGICAL(sv)) {
609 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
610 if (mg)
611 mg->mg_len = 0;
612 }
0eda9292 613 }
c277df42
IZ
614 data->last_end = -1;
615 data->flags &= ~SF_BEFORE_EOL;
1de06328 616 DEBUG_STUDYDATA(data,0);
c277df42
IZ
617}
618
653099ff
GS
619/* Can match anything (initialization) */
620STATIC void
097eb12c 621S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 622{
653099ff 623 ANYOF_CLASS_ZERO(cl);
f8bef550 624 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 625 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
626 if (LOC)
627 cl->flags |= ANYOF_LOCALE;
628}
629
630/* Can match anything (initialization) */
631STATIC int
5f66b61c 632S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
633{
634 int value;
635
aaa51d5e 636 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
637 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
638 return 1;
1aa99e6b
IH
639 if (!(cl->flags & ANYOF_UNICODE_ALL))
640 return 0;
10edeb5d 641 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 642 return 0;
653099ff
GS
643 return 1;
644}
645
646/* Can match anything (initialization) */
647STATIC void
097eb12c 648S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 649{
8ecf7187 650 Zero(cl, 1, struct regnode_charclass_class);
653099ff 651 cl->type = ANYOF;
830247a4 652 cl_anything(pRExC_state, cl);
653099ff
GS
653}
654
655STATIC void
097eb12c 656S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 657{
8ecf7187 658 Zero(cl, 1, struct regnode_charclass_class);
653099ff 659 cl->type = ANYOF;
830247a4 660 cl_anything(pRExC_state, cl);
653099ff
GS
661 if (LOC)
662 cl->flags |= ANYOF_LOCALE;
663}
664
665/* 'And' a given class with another one. Can create false positives */
666/* We assume that cl is not inverted */
667STATIC void
5f66b61c 668S_cl_and(struct regnode_charclass_class *cl,
a28509cc 669 const struct regnode_charclass_class *and_with)
653099ff 670{
653099ff
GS
671 if (!(and_with->flags & ANYOF_CLASS)
672 && !(cl->flags & ANYOF_CLASS)
673 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
674 && !(and_with->flags & ANYOF_FOLD)
675 && !(cl->flags & ANYOF_FOLD)) {
676 int i;
677
678 if (and_with->flags & ANYOF_INVERT)
679 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
680 cl->bitmap[i] &= ~and_with->bitmap[i];
681 else
682 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
683 cl->bitmap[i] &= and_with->bitmap[i];
684 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
685 if (!(and_with->flags & ANYOF_EOS))
686 cl->flags &= ~ANYOF_EOS;
1aa99e6b 687
14ebb1a2
JH
688 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
689 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
690 cl->flags &= ~ANYOF_UNICODE_ALL;
691 cl->flags |= ANYOF_UNICODE;
692 ARG_SET(cl, ARG(and_with));
693 }
14ebb1a2
JH
694 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
695 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 696 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
697 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
698 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 699 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
700}
701
702/* 'OR' a given class with another one. Can create false positives */
703/* We assume that cl is not inverted */
704STATIC void
097eb12c 705S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 706{
653099ff
GS
707 if (or_with->flags & ANYOF_INVERT) {
708 /* We do not use
709 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
710 * <= (B1 | !B2) | (CL1 | !CL2)
711 * which is wasteful if CL2 is small, but we ignore CL2:
712 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
713 * XXXX Can we handle case-fold? Unclear:
714 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
715 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
716 */
717 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
718 && !(or_with->flags & ANYOF_FOLD)
719 && !(cl->flags & ANYOF_FOLD) ) {
720 int i;
721
722 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
723 cl->bitmap[i] |= ~or_with->bitmap[i];
724 } /* XXXX: logic is complicated otherwise */
725 else {
830247a4 726 cl_anything(pRExC_state, cl);
653099ff
GS
727 }
728 } else {
729 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
730 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 731 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
732 || (cl->flags & ANYOF_FOLD)) ) {
733 int i;
734
735 /* OR char bitmap and class bitmap separately */
736 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
737 cl->bitmap[i] |= or_with->bitmap[i];
738 if (or_with->flags & ANYOF_CLASS) {
739 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
740 cl->classflags[i] |= or_with->classflags[i];
741 cl->flags |= ANYOF_CLASS;
742 }
743 }
744 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 745 cl_anything(pRExC_state, cl);
653099ff
GS
746 }
747 }
748 if (or_with->flags & ANYOF_EOS)
749 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
750
751 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
752 ARG(cl) != ARG(or_with)) {
753 cl->flags |= ANYOF_UNICODE_ALL;
754 cl->flags &= ~ANYOF_UNICODE;
755 }
756 if (or_with->flags & ANYOF_UNICODE_ALL) {
757 cl->flags |= ANYOF_UNICODE_ALL;
758 cl->flags &= ~ANYOF_UNICODE;
759 }
653099ff
GS
760}
761
a3621e74
YO
762#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
763#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
764#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
765#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
766
3dab1dad
YO
767
768#ifdef DEBUGGING
07be1b83 769/*
3dab1dad
YO
770 dump_trie(trie)
771 dump_trie_interim_list(trie,next_alloc)
772 dump_trie_interim_table(trie,next_alloc)
773
774 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
775 The _interim_ variants are used for debugging the interim
776 tables that are used to generate the final compressed
777 representation which is what dump_trie expects.
778
3dab1dad
YO
779 Part of the reason for their existance is to provide a form
780 of documentation as to how the different representations function.
07be1b83
YO
781
782*/
3dab1dad
YO
783
784/*
785 dump_trie(trie)
786 Dumps the final compressed table form of the trie to Perl_debug_log.
787 Used for debugging make_trie().
788*/
789
790STATIC void
791S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
792{
793 U32 state;
ab3bbdeb
YO
794 SV *sv=sv_newmortal();
795 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
796 GET_RE_DEBUG_FLAGS_DECL;
797
ab3bbdeb 798
3dab1dad
YO
799 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
800 (int)depth * 2 + 2,"",
801 "Match","Base","Ofs" );
802
803 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 804 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 805 if ( tmp ) {
ab3bbdeb
YO
806 PerlIO_printf( Perl_debug_log, "%*s",
807 colwidth,
ddc5bc0f 808 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
809 PL_colors[0], PL_colors[1],
810 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
811 PERL_PV_ESCAPE_FIRSTCHAR
812 )
813 );
3dab1dad
YO
814 }
815 }
816 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
817 (int)depth * 2 + 2,"");
818
819 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 820 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
821 PerlIO_printf( Perl_debug_log, "\n");
822
1e2e3d02 823 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 824 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
825
826 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
827
828 if ( trie->states[ state ].wordnum ) {
829 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
830 } else {
831 PerlIO_printf( Perl_debug_log, "%6s", "" );
832 }
833
834 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
835
836 if ( base ) {
837 U32 ofs = 0;
838
839 while( ( base + ofs < trie->uniquecharcount ) ||
840 ( base + ofs - trie->uniquecharcount < trie->lasttrans
841 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
842 ofs++;
843
844 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
845
846 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
847 if ( ( base + ofs >= trie->uniquecharcount ) &&
848 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
849 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
850 {
ab3bbdeb
YO
851 PerlIO_printf( Perl_debug_log, "%*"UVXf,
852 colwidth,
3dab1dad
YO
853 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
854 } else {
ab3bbdeb 855 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
856 }
857 }
858
859 PerlIO_printf( Perl_debug_log, "]");
860
861 }
862 PerlIO_printf( Perl_debug_log, "\n" );
863 }
864}
865/*
866 dump_trie_interim_list(trie,next_alloc)
867 Dumps a fully constructed but uncompressed trie in list form.
868 List tries normally only are used for construction when the number of
869 possible chars (trie->uniquecharcount) is very high.
870 Used for debugging make_trie().
871*/
872STATIC void
873S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
874{
875 U32 state;
ab3bbdeb
YO
876 SV *sv=sv_newmortal();
877 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
878 GET_RE_DEBUG_FLAGS_DECL;
879 /* print out the table precompression. */
ab3bbdeb
YO
880 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
881 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
882 "------:-----+-----------------\n" );
3dab1dad
YO
883
884 for( state=1 ; state < next_alloc ; state ++ ) {
885 U16 charid;
886
ab3bbdeb 887 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
888 (int)depth * 2 + 2,"", (UV)state );
889 if ( ! trie->states[ state ].wordnum ) {
890 PerlIO_printf( Perl_debug_log, "%5s| ","");
891 } else {
892 PerlIO_printf( Perl_debug_log, "W%4x| ",
893 trie->states[ state ].wordnum
894 );
895 }
896 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 897 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
898 if ( tmp ) {
899 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
900 colwidth,
ddc5bc0f 901 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
902 PL_colors[0], PL_colors[1],
903 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
904 PERL_PV_ESCAPE_FIRSTCHAR
905 ) ,
1e2e3d02
YO
906 TRIE_LIST_ITEM(state,charid).forid,
907 (UV)TRIE_LIST_ITEM(state,charid).newstate
908 );
909 if (!(charid % 10))
664e119d
RGS
910 PerlIO_printf(Perl_debug_log, "\n%*s| ",
911 (int)((depth * 2) + 14), "");
1e2e3d02 912 }
ab3bbdeb
YO
913 }
914 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
915 }
916}
917
918/*
919 dump_trie_interim_table(trie,next_alloc)
920 Dumps a fully constructed but uncompressed trie in table form.
921 This is the normal DFA style state transition table, with a few
922 twists to facilitate compression later.
923 Used for debugging make_trie().
924*/
925STATIC void
926S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
927{
928 U32 state;
929 U16 charid;
ab3bbdeb
YO
930 SV *sv=sv_newmortal();
931 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
932 GET_RE_DEBUG_FLAGS_DECL;
933
934 /*
935 print out the table precompression so that we can do a visual check
936 that they are identical.
937 */
938
939 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
940
941 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 942 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 943 if ( tmp ) {
ab3bbdeb
YO
944 PerlIO_printf( Perl_debug_log, "%*s",
945 colwidth,
ddc5bc0f 946 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
947 PL_colors[0], PL_colors[1],
948 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
949 PERL_PV_ESCAPE_FIRSTCHAR
950 )
951 );
3dab1dad
YO
952 }
953 }
954
955 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
956
957 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 958 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
959 }
960
961 PerlIO_printf( Perl_debug_log, "\n" );
962
963 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
964
965 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
966 (int)depth * 2 + 2,"",
967 (UV)TRIE_NODENUM( state ) );
968
969 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
970 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
971 if (v)
972 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
973 else
974 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
975 }
976 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
977 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
978 } else {
979 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
980 trie->states[ TRIE_NODENUM( state ) ].wordnum );
981 }
982 }
07be1b83 983}
3dab1dad
YO
984
985#endif
986
786e8c11
YO
987/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
988 startbranch: the first branch in the whole branch sequence
989 first : start branch of sequence of branch-exact nodes.
990 May be the same as startbranch
991 last : Thing following the last branch.
992 May be the same as tail.
993 tail : item following the branch sequence
994 count : words in the sequence
995 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
996 depth : indent depth
3dab1dad 997
786e8c11 998Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 999
786e8c11
YO
1000A trie is an N'ary tree where the branches are determined by digital
1001decomposition of the key. IE, at the root node you look up the 1st character and
1002follow that branch repeat until you find the end of the branches. Nodes can be
1003marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1004
786e8c11 1005 /he|she|his|hers/
72f13be8 1006
786e8c11
YO
1007would convert into the following structure. Numbers represent states, letters
1008following numbers represent valid transitions on the letter from that state, if
1009the number is in square brackets it represents an accepting state, otherwise it
1010will be in parenthesis.
07be1b83 1011
786e8c11
YO
1012 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1013 | |
1014 | (2)
1015 | |
1016 (1) +-i->(6)-+-s->[7]
1017 |
1018 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1019
786e8c11
YO
1020 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1021
1022This shows that when matching against the string 'hers' we will begin at state 1
1023read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1024then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1025is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1026single traverse. We store a mapping from accepting to state to which word was
1027matched, and then when we have multiple possibilities we try to complete the
1028rest of the regex in the order in which they occured in the alternation.
1029
1030The only prior NFA like behaviour that would be changed by the TRIE support is
1031the silent ignoring of duplicate alternations which are of the form:
1032
1033 / (DUPE|DUPE) X? (?{ ... }) Y /x
1034
1035Thus EVAL blocks follwing a trie may be called a different number of times with
1036and without the optimisation. With the optimisations dupes will be silently
1037ignored. This inconsistant behaviour of EVAL type nodes is well established as
1038the following demonstrates:
1039
1040 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1041
1042which prints out 'word' three times, but
1043
1044 'words'=~/(word|word|word)(?{ print $1 })S/
1045
1046which doesnt print it out at all. This is due to other optimisations kicking in.
1047
1048Example of what happens on a structural level:
1049
1050The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1051
1052 1: CURLYM[1] {1,32767}(18)
1053 5: BRANCH(8)
1054 6: EXACT <ac>(16)
1055 8: BRANCH(11)
1056 9: EXACT <ad>(16)
1057 11: BRANCH(14)
1058 12: EXACT <ab>(16)
1059 16: SUCCEED(0)
1060 17: NOTHING(18)
1061 18: END(0)
1062
1063This would be optimizable with startbranch=5, first=5, last=16, tail=16
1064and should turn into:
1065
1066 1: CURLYM[1] {1,32767}(18)
1067 5: TRIE(16)
1068 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1069 <ac>
1070 <ad>
1071 <ab>
1072 16: SUCCEED(0)
1073 17: NOTHING(18)
1074 18: END(0)
1075
1076Cases where tail != last would be like /(?foo|bar)baz/:
1077
1078 1: BRANCH(4)
1079 2: EXACT <foo>(8)
1080 4: BRANCH(7)
1081 5: EXACT <bar>(8)
1082 7: TAIL(8)
1083 8: EXACT <baz>(10)
1084 10: END(0)
1085
1086which would be optimizable with startbranch=1, first=1, last=7, tail=8
1087and would end up looking like:
1088
1089 1: TRIE(8)
1090 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1091 <foo>
1092 <bar>
1093 7: TAIL(8)
1094 8: EXACT <baz>(10)
1095 10: END(0)
1096
1097 d = uvuni_to_utf8_flags(d, uv, 0);
1098
1099is the recommended Unicode-aware way of saying
1100
1101 *(d++) = uv;
1102*/
1103
1e2e3d02 1104#define TRIE_STORE_REVCHAR \
786e8c11 1105 STMT_START { \
1e2e3d02 1106 SV *tmp = newSVpvs(""); \
786e8c11 1107 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1108 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1109 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1110 } STMT_END
1111
1112#define TRIE_READ_CHAR STMT_START { \
1113 wordlen++; \
1114 if ( UTF ) { \
1115 if ( folder ) { \
1116 if ( foldlen > 0 ) { \
1117 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1118 foldlen -= len; \
1119 scan += len; \
1120 len = 0; \
1121 } else { \
1122 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1123 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1124 foldlen -= UNISKIP( uvc ); \
1125 scan = foldbuf + UNISKIP( uvc ); \
1126 } \
1127 } else { \
1128 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1129 } \
1130 } else { \
1131 uvc = (U32)*uc; \
1132 len = 1; \
1133 } \
1134} STMT_END
1135
1136
1137
1138#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1139 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1140 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1141 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1142 } \
1143 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1144 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1145 TRIE_LIST_CUR( state )++; \
1146} STMT_END
07be1b83 1147
786e8c11
YO
1148#define TRIE_LIST_NEW(state) STMT_START { \
1149 Newxz( trie->states[ state ].trans.list, \
1150 4, reg_trie_trans_le ); \
1151 TRIE_LIST_CUR( state ) = 1; \
1152 TRIE_LIST_LEN( state ) = 4; \
1153} STMT_END
07be1b83 1154
786e8c11
YO
1155#define TRIE_HANDLE_WORD(state) STMT_START { \
1156 U16 dupe= trie->states[ state ].wordnum; \
1157 regnode * const noper_next = regnext( noper ); \
1158 \
1159 if (trie->wordlen) \
1160 trie->wordlen[ curword ] = wordlen; \
1161 DEBUG_r({ \
1162 /* store the word for dumping */ \
1163 SV* tmp; \
1164 if (OP(noper) != NOTHING) \
1165 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1166 else \
1167 tmp = newSVpvn( "", 0 ); \
1168 if ( UTF ) SvUTF8_on( tmp ); \
1169 av_push( trie->words, tmp ); \
1170 }); \
1171 \
1172 curword++; \
1173 \
1174 if ( noper_next < tail ) { \
1175 if (!trie->jump) \
1176 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1177 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1178 if (!jumper) \
1179 jumper = noper_next; \
1180 if (!nextbranch) \
1181 nextbranch= regnext(cur); \
1182 } \
1183 \
1184 if ( dupe ) { \
1185 /* So it's a dupe. This means we need to maintain a */\
1186 /* linked-list from the first to the next. */\
1187 /* we only allocate the nextword buffer when there */\
1188 /* a dupe, so first time we have to do the allocation */\
1189 if (!trie->nextword) \
1190 Newxz( trie->nextword, word_count + 1, U16); \
1191 while ( trie->nextword[dupe] ) \
1192 dupe= trie->nextword[dupe]; \
1193 trie->nextword[dupe]= curword; \
1194 } else { \
1195 /* we haven't inserted this word yet. */ \
1196 trie->states[ state ].wordnum = curword; \
1197 } \
1198} STMT_END
07be1b83 1199
3dab1dad 1200
786e8c11
YO
1201#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1202 ( ( base + charid >= ucharcount \
1203 && base + charid < ubound \
1204 && state == trie->trans[ base - ucharcount + charid ].check \
1205 && trie->trans[ base - ucharcount + charid ].next ) \
1206 ? trie->trans[ base - ucharcount + charid ].next \
1207 : ( state==1 ? special : 0 ) \
1208 )
3dab1dad 1209
786e8c11
YO
1210#define MADE_TRIE 1
1211#define MADE_JUMP_TRIE 2
1212#define MADE_EXACT_TRIE 4
3dab1dad 1213
a3621e74 1214STATIC I32
786e8c11 1215S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1216{
27da23d5 1217 dVAR;
a3621e74
YO
1218 /* first pass, loop through and scan words */
1219 reg_trie_data *trie;
1220 regnode *cur;
9f7f3913 1221 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1222 STRLEN len = 0;
1223 UV uvc = 0;
1224 U16 curword = 0;
1225 U32 next_alloc = 0;
786e8c11
YO
1226 regnode *jumper = NULL;
1227 regnode *nextbranch = NULL;
7f69552c 1228 regnode *convert = NULL;
a3621e74 1229 /* we just use folder as a flag in utf8 */
e1ec3a88 1230 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1231 ? PL_fold
1232 : ( flags == EXACTFL
1233 ? PL_fold_locale
1234 : NULL
1235 )
1236 );
1237
e1ec3a88 1238 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1239 SV *re_trie_maxbuff;
3dab1dad
YO
1240#ifndef DEBUGGING
1241 /* these are only used during construction but are useful during
8e11feef 1242 * debugging so we store them in the struct when debugging.
8e11feef 1243 */
3dab1dad 1244 STRLEN trie_charcount=0;
3dab1dad
YO
1245 AV *trie_revcharmap;
1246#endif
a3621e74 1247 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1248#ifndef DEBUGGING
1249 PERL_UNUSED_ARG(depth);
1250#endif
a3621e74 1251
a02a5408 1252 Newxz( trie, 1, reg_trie_data );
a3621e74 1253 trie->refcount = 1;
3dab1dad 1254 trie->startstate = 1;
786e8c11 1255 trie->wordcount = word_count;
a3621e74 1256 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1257 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1258 if (!(UTF && folder))
1259 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1260 DEBUG_r({
1261 trie->words = newAV();
a3621e74 1262 });
3dab1dad 1263 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1264
0111c4fd 1265 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1266 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1267 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1268 }
3dab1dad
YO
1269 DEBUG_OPTIMISE_r({
1270 PerlIO_printf( Perl_debug_log,
786e8c11 1271 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1272 (int)depth * 2 + 2, "",
1273 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1274 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1275 (int)depth);
3dab1dad 1276 });
7f69552c
YO
1277
1278 /* Find the node we are going to overwrite */
1279 if ( first == startbranch && OP( last ) != BRANCH ) {
1280 /* whole branch chain */
1281 convert = first;
1282 } else {
1283 /* branch sub-chain */
1284 convert = NEXTOPER( first );
1285 }
1286
a3621e74
YO
1287 /* -- First loop and Setup --
1288
1289 We first traverse the branches and scan each word to determine if it
1290 contains widechars, and how many unique chars there are, this is
1291 important as we have to build a table with at least as many columns as we
1292 have unique chars.
1293
1294 We use an array of integers to represent the character codes 0..255
1295 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1296 native representation of the character value as the key and IV's for the
1297 coded index.
1298
1299 *TODO* If we keep track of how many times each character is used we can
1300 remap the columns so that the table compression later on is more
1301 efficient in terms of memory by ensuring most common value is in the
1302 middle and the least common are on the outside. IMO this would be better
1303 than a most to least common mapping as theres a decent chance the most
1304 common letter will share a node with the least common, meaning the node
1305 will not be compressable. With a middle is most common approach the worst
1306 case is when we have the least common nodes twice.
1307
1308 */
1309
a3621e74 1310 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1311 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1312 const U8 *uc = (U8*)STRING( noper );
a28509cc 1313 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1314 STRLEN foldlen = 0;
1315 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1316 const U8 *scan = (U8*)NULL;
07be1b83 1317 U32 wordlen = 0; /* required init */
3dab1dad 1318 STRLEN chars=0;
a3621e74 1319
3dab1dad
YO
1320 if (OP(noper) == NOTHING) {
1321 trie->minlen= 0;
1322 continue;
1323 }
1324 if (trie->bitmap) {
1325 TRIE_BITMAP_SET(trie,*uc);
1326 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1327 }
a3621e74 1328 for ( ; uc < e ; uc += len ) {
3dab1dad 1329 TRIE_CHARCOUNT(trie)++;
a3621e74 1330 TRIE_READ_CHAR;
3dab1dad 1331 chars++;
a3621e74
YO
1332 if ( uvc < 256 ) {
1333 if ( !trie->charmap[ uvc ] ) {
1334 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1335 if ( folder )
1336 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1337 TRIE_STORE_REVCHAR;
a3621e74
YO
1338 }
1339 } else {
1340 SV** svpp;
1341 if ( !trie->widecharmap )
1342 trie->widecharmap = newHV();
1343
1344 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1345
1346 if ( !svpp )
e4584336 1347 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1348
1349 if ( !SvTRUE( *svpp ) ) {
1350 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1351 TRIE_STORE_REVCHAR;
a3621e74
YO
1352 }
1353 }
1354 }
3dab1dad
YO
1355 if( cur == first ) {
1356 trie->minlen=chars;
1357 trie->maxlen=chars;
1358 } else if (chars < trie->minlen) {
1359 trie->minlen=chars;
1360 } else if (chars > trie->maxlen) {
1361 trie->maxlen=chars;
1362 }
1363
a3621e74
YO
1364 } /* end first pass */
1365 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1366 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1367 (int)depth * 2 + 2,"",
85c3142d 1368 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1369 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1370 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1371 );
786e8c11 1372 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1373
1374 /*
1375 We now know what we are dealing with in terms of unique chars and
1376 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1377 representation using a flat table will take. If it's over a reasonable
1378 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1379 conservative but potentially much slower representation using an array
1380 of lists.
1381
1382 At the end we convert both representations into the same compressed
1383 form that will be used in regexec.c for matching with. The latter
1384 is a form that cannot be used to construct with but has memory
1385 properties similar to the list form and access properties similar
1386 to the table form making it both suitable for fast searches and
1387 small enough that its feasable to store for the duration of a program.
1388
1389 See the comment in the code where the compressed table is produced
1390 inplace from the flat tabe representation for an explanation of how
1391 the compression works.
1392
1393 */
1394
1395
3dab1dad 1396 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1397 /*
1398 Second Pass -- Array Of Lists Representation
1399
1400 Each state will be represented by a list of charid:state records
1401 (reg_trie_trans_le) the first such element holds the CUR and LEN
1402 points of the allocated array. (See defines above).
1403
1404 We build the initial structure using the lists, and then convert
1405 it into the compressed table form which allows faster lookups
1406 (but cant be modified once converted).
a3621e74
YO
1407 */
1408
a3621e74
YO
1409 STRLEN transcount = 1;
1410
1e2e3d02
YO
1411 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1412 "%*sCompiling trie using list compiler\n",
1413 (int)depth * 2 + 2, ""));
1414
3dab1dad 1415 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1416 TRIE_LIST_NEW(1);
1417 next_alloc = 2;
1418
1419 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1420
c445ea15
AL
1421 regnode * const noper = NEXTOPER( cur );
1422 U8 *uc = (U8*)STRING( noper );
1423 const U8 * const e = uc + STR_LEN( noper );
1424 U32 state = 1; /* required init */
1425 U16 charid = 0; /* sanity init */
1426 U8 *scan = (U8*)NULL; /* sanity init */
1427 STRLEN foldlen = 0; /* required init */
07be1b83 1428 U32 wordlen = 0; /* required init */
c445ea15
AL
1429 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1430
3dab1dad 1431 if (OP(noper) != NOTHING) {
786e8c11 1432 for ( ; uc < e ; uc += len ) {
c445ea15 1433
786e8c11 1434 TRIE_READ_CHAR;
c445ea15 1435
786e8c11
YO
1436 if ( uvc < 256 ) {
1437 charid = trie->charmap[ uvc ];
c445ea15 1438 } else {
786e8c11
YO
1439 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1440 if ( !svpp ) {
1441 charid = 0;
1442 } else {
1443 charid=(U16)SvIV( *svpp );
1444 }
c445ea15 1445 }
786e8c11
YO
1446 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1447 if ( charid ) {
a3621e74 1448
786e8c11
YO
1449 U16 check;
1450 U32 newstate = 0;
a3621e74 1451
786e8c11
YO
1452 charid--;
1453 if ( !trie->states[ state ].trans.list ) {
1454 TRIE_LIST_NEW( state );
c445ea15 1455 }
786e8c11
YO
1456 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1457 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1458 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1459 break;
1460 }
1461 }
1462 if ( ! newstate ) {
1463 newstate = next_alloc++;
1464 TRIE_LIST_PUSH( state, charid, newstate );
1465 transcount++;
1466 }
1467 state = newstate;
1468 } else {
1469 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1470 }
a28509cc 1471 }
c445ea15 1472 }
3dab1dad 1473 TRIE_HANDLE_WORD(state);
a3621e74
YO
1474
1475 } /* end second pass */
1476
1e2e3d02
YO
1477 /* next alloc is the NEXT state to be allocated */
1478 trie->statecount = next_alloc;
a3621e74
YO
1479 Renew( trie->states, next_alloc, reg_trie_state );
1480
3dab1dad
YO
1481 /* and now dump it out before we compress it */
1482 DEBUG_TRIE_COMPILE_MORE_r(
1483 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1484 );
a3621e74 1485
a02a5408 1486 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1487 {
1488 U32 state;
a3621e74
YO
1489 U32 tp = 0;
1490 U32 zp = 0;
1491
1492
1493 for( state=1 ; state < next_alloc ; state ++ ) {
1494 U32 base=0;
1495
1496 /*
1497 DEBUG_TRIE_COMPILE_MORE_r(
1498 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1499 );
1500 */
1501
1502 if (trie->states[state].trans.list) {
1503 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1504 U16 maxid=minid;
a28509cc 1505 U16 idx;
a3621e74
YO
1506
1507 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1508 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1509 if ( forid < minid ) {
1510 minid=forid;
1511 } else if ( forid > maxid ) {
1512 maxid=forid;
1513 }
a3621e74
YO
1514 }
1515 if ( transcount < tp + maxid - minid + 1) {
1516 transcount *= 2;
1517 Renew( trie->trans, transcount, reg_trie_trans );
1518 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1519 }
1520 base = trie->uniquecharcount + tp - minid;
1521 if ( maxid == minid ) {
1522 U32 set = 0;
1523 for ( ; zp < tp ; zp++ ) {
1524 if ( ! trie->trans[ zp ].next ) {
1525 base = trie->uniquecharcount + zp - minid;
1526 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1527 trie->trans[ zp ].check = state;
1528 set = 1;
1529 break;
1530 }
1531 }
1532 if ( !set ) {
1533 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1534 trie->trans[ tp ].check = state;
1535 tp++;
1536 zp = tp;
1537 }
1538 } else {
1539 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1540 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1541 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1542 trie->trans[ tid ].check = state;
1543 }
1544 tp += ( maxid - minid + 1 );
1545 }
1546 Safefree(trie->states[ state ].trans.list);
1547 }
1548 /*
1549 DEBUG_TRIE_COMPILE_MORE_r(
1550 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1551 );
1552 */
1553 trie->states[ state ].trans.base=base;
1554 }
cc601c31 1555 trie->lasttrans = tp + 1;
a3621e74
YO
1556 }
1557 } else {
1558 /*
1559 Second Pass -- Flat Table Representation.
1560
1561 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1562 We know that we will need Charcount+1 trans at most to store the data
1563 (one row per char at worst case) So we preallocate both structures
1564 assuming worst case.
1565
1566 We then construct the trie using only the .next slots of the entry
1567 structs.
1568
1569 We use the .check field of the first entry of the node temporarily to
1570 make compression both faster and easier by keeping track of how many non
1571 zero fields are in the node.
1572
1573 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1574 transition.
1575
1576 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1577 number representing the first entry of the node, and state as a
1578 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1579 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1580 are 2 entrys per node. eg:
1581
1582 A B A B
1583 1. 2 4 1. 3 7
1584 2. 0 3 3. 0 5
1585 3. 0 0 5. 0 0
1586 4. 0 0 7. 0 0
1587
1588 The table is internally in the right hand, idx form. However as we also
1589 have to deal with the states array which is indexed by nodenum we have to
1590 use TRIE_NODENUM() to convert.
1591
1592 */
1e2e3d02
YO
1593 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1594 "%*sCompiling trie using table compiler\n",
1595 (int)depth * 2 + 2, ""));
3dab1dad
YO
1596
1597 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1598 reg_trie_trans );
3dab1dad 1599 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1600 next_alloc = trie->uniquecharcount + 1;
1601
3dab1dad 1602
a3621e74
YO
1603 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1604
c445ea15 1605 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1606 const U8 *uc = (U8*)STRING( noper );
1607 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1608
1609 U32 state = 1; /* required init */
1610
1611 U16 charid = 0; /* sanity init */
1612 U32 accept_state = 0; /* sanity init */
1613 U8 *scan = (U8*)NULL; /* sanity init */
1614
1615 STRLEN foldlen = 0; /* required init */
07be1b83 1616 U32 wordlen = 0; /* required init */
a3621e74
YO
1617 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1618
3dab1dad 1619 if ( OP(noper) != NOTHING ) {
786e8c11 1620 for ( ; uc < e ; uc += len ) {
a3621e74 1621
786e8c11 1622 TRIE_READ_CHAR;
a3621e74 1623
786e8c11
YO
1624 if ( uvc < 256 ) {
1625 charid = trie->charmap[ uvc ];
1626 } else {
1627 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1628 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1629 }
786e8c11
YO
1630 if ( charid ) {
1631 charid--;
1632 if ( !trie->trans[ state + charid ].next ) {
1633 trie->trans[ state + charid ].next = next_alloc;
1634 trie->trans[ state ].check++;
1635 next_alloc += trie->uniquecharcount;
1636 }
1637 state = trie->trans[ state + charid ].next;
1638 } else {
1639 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1640 }
1641 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1642 }
a3621e74 1643 }
3dab1dad
YO
1644 accept_state = TRIE_NODENUM( state );
1645 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1646
1647 } /* end second pass */
1648
3dab1dad
YO
1649 /* and now dump it out before we compress it */
1650 DEBUG_TRIE_COMPILE_MORE_r(
1651 dump_trie_interim_table(trie,next_alloc,depth+1)
1652 );
a3621e74 1653
a3621e74
YO
1654 {
1655 /*
1656 * Inplace compress the table.*
1657
1658 For sparse data sets the table constructed by the trie algorithm will
1659 be mostly 0/FAIL transitions or to put it another way mostly empty.
1660 (Note that leaf nodes will not contain any transitions.)
1661
1662 This algorithm compresses the tables by eliminating most such
1663 transitions, at the cost of a modest bit of extra work during lookup:
1664
1665 - Each states[] entry contains a .base field which indicates the
1666 index in the state[] array wheres its transition data is stored.
1667
1668 - If .base is 0 there are no valid transitions from that node.
1669
1670 - If .base is nonzero then charid is added to it to find an entry in
1671 the trans array.
1672
1673 -If trans[states[state].base+charid].check!=state then the
1674 transition is taken to be a 0/Fail transition. Thus if there are fail
1675 transitions at the front of the node then the .base offset will point
1676 somewhere inside the previous nodes data (or maybe even into a node
1677 even earlier), but the .check field determines if the transition is
1678 valid.
1679
786e8c11 1680 XXX - wrong maybe?
a3621e74
YO
1681 The following process inplace converts the table to the compressed
1682 table: We first do not compress the root node 1,and mark its all its
1683 .check pointers as 1 and set its .base pointer as 1 as well. This
1684 allows to do a DFA construction from the compressed table later, and
1685 ensures that any .base pointers we calculate later are greater than
1686 0.
1687
1688 - We set 'pos' to indicate the first entry of the second node.
1689
1690 - We then iterate over the columns of the node, finding the first and
1691 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1692 and set the .check pointers accordingly, and advance pos
1693 appropriately and repreat for the next node. Note that when we copy
1694 the next pointers we have to convert them from the original
1695 NODEIDX form to NODENUM form as the former is not valid post
1696 compression.
1697
1698 - If a node has no transitions used we mark its base as 0 and do not
1699 advance the pos pointer.
1700
1701 - If a node only has one transition we use a second pointer into the
1702 structure to fill in allocated fail transitions from other states.
1703 This pointer is independent of the main pointer and scans forward
1704 looking for null transitions that are allocated to a state. When it
1705 finds one it writes the single transition into the "hole". If the
786e8c11 1706 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1707
1708 - Once compressed we can Renew/realloc the structures to release the
1709 excess space.
1710
1711 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1712 specifically Fig 3.47 and the associated pseudocode.
1713
1714 demq
1715 */
a3b680e6 1716 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1717 U32 state, charid;
a3621e74 1718 U32 pos = 0, zp=0;
1e2e3d02 1719 trie->statecount = laststate;
a3621e74
YO
1720
1721 for ( state = 1 ; state < laststate ; state++ ) {
1722 U8 flag = 0;
a28509cc
AL
1723 const U32 stateidx = TRIE_NODEIDX( state );
1724 const U32 o_used = trie->trans[ stateidx ].check;
1725 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1726 trie->trans[ stateidx ].check = 0;
1727
1728 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1729 if ( flag || trie->trans[ stateidx + charid ].next ) {
1730 if ( trie->trans[ stateidx + charid ].next ) {
1731 if (o_used == 1) {
1732 for ( ; zp < pos ; zp++ ) {
1733 if ( ! trie->trans[ zp ].next ) {
1734 break;
1735 }
1736 }
1737 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1738 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1739 trie->trans[ zp ].check = state;
1740 if ( ++zp > pos ) pos = zp;
1741 break;
1742 }
1743 used--;
1744 }
1745 if ( !flag ) {
1746 flag = 1;
1747 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1748 }
1749 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1750 trie->trans[ pos ].check = state;
1751 pos++;
1752 }
1753 }
1754 }
cc601c31 1755 trie->lasttrans = pos + 1;
1e2e3d02 1756 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1757 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1758 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1759 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1760 (int)depth * 2 + 2,"",
1761 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1762 (IV)next_alloc,
1763 (IV)pos,
a3621e74
YO
1764 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1765 );
1766
1767 } /* end table compress */
1768 }
1e2e3d02
YO
1769 DEBUG_TRIE_COMPILE_MORE_r(
1770 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1771 (int)depth * 2 + 2, "",
1772 (UV)trie->statecount,
1773 (UV)trie->lasttrans)
1774 );
cc601c31
YO
1775 /* resize the trans array to remove unused space */
1776 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1777
3dab1dad
YO
1778 /* and now dump out the compressed format */
1779 DEBUG_TRIE_COMPILE_r(
1780 dump_trie(trie,depth+1)
1781 );
07be1b83 1782
3dab1dad 1783 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1784 U8 nodetype =(U8)(flags & 0xFF);
1785 char *str=NULL;
786e8c11 1786
07be1b83 1787#ifdef DEBUGGING
e62cc96a 1788 regnode *optimize = NULL;
b57a0404
JH
1789 U32 mjd_offset = 0;
1790 U32 mjd_nodelen = 0;
07be1b83 1791#endif
a3621e74 1792 /*
3dab1dad
YO
1793 This means we convert either the first branch or the first Exact,
1794 depending on whether the thing following (in 'last') is a branch
1795 or not and whther first is the startbranch (ie is it a sub part of
1796 the alternation or is it the whole thing.)
1797 Assuming its a sub part we conver the EXACT otherwise we convert
1798 the whole branch sequence, including the first.
a3621e74 1799 */
3dab1dad 1800 /* Find the node we are going to overwrite */
7f69552c 1801 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1802 /* branch sub-chain */
3dab1dad 1803 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1804 DEBUG_r({
1805 mjd_offset= Node_Offset((convert));
1806 mjd_nodelen= Node_Length((convert));
1807 });
7f69552c
YO
1808 /* whole branch chain */
1809 } else {
1810 DEBUG_r({
1811 const regnode *nop = NEXTOPER( convert );
1812 mjd_offset= Node_Offset((nop));
1813 mjd_nodelen= Node_Length((nop));
1814 });
07be1b83 1815 }
7f69552c 1816
07be1b83
YO
1817 DEBUG_OPTIMISE_r(
1818 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1819 (int)depth * 2 + 2, "",
786e8c11 1820 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1821 );
a3621e74 1822
3dab1dad
YO
1823 /* But first we check to see if there is a common prefix we can
1824 split out as an EXACT and put in front of the TRIE node. */
1825 trie->startstate= 1;
786e8c11 1826 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1827 U32 state;
1e2e3d02 1828 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1829 U32 ofs = 0;
8e11feef
RGS
1830 I32 idx = -1;
1831 U32 count = 0;
1832 const U32 base = trie->states[ state ].trans.base;
a3621e74 1833
3dab1dad 1834 if ( trie->states[state].wordnum )
8e11feef 1835 count = 1;
a3621e74 1836
8e11feef 1837 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1838 if ( ( base + ofs >= trie->uniquecharcount ) &&
1839 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1840 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1841 {
3dab1dad 1842 if ( ++count > 1 ) {
8e11feef 1843 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1844 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1845 if ( state == 1 ) break;
3dab1dad
YO
1846 if ( count == 2 ) {
1847 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1848 DEBUG_OPTIMISE_r(
8e11feef
RGS
1849 PerlIO_printf(Perl_debug_log,
1850 "%*sNew Start State=%"UVuf" Class: [",
1851 (int)depth * 2 + 2, "",
786e8c11 1852 (UV)state));
be8e71aa
YO
1853 if (idx >= 0) {
1854 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1855 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1856
3dab1dad 1857 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1858 if ( folder )
1859 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1860 DEBUG_OPTIMISE_r(
07be1b83 1861 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1862 );
8e11feef
RGS
1863 }
1864 }
1865 TRIE_BITMAP_SET(trie,*ch);
1866 if ( folder )
1867 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1868 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1869 }
1870 idx = ofs;
1871 }
3dab1dad
YO
1872 }
1873 if ( count == 1 ) {
1874 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1875 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1876 DEBUG_OPTIMISE_r(
8e11feef
RGS
1877 PerlIO_printf( Perl_debug_log,
1878 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1879 (int)depth * 2 + 2, "",
786e8c11 1880 (UV)state, (UV)idx, ch)
3dab1dad
YO
1881 );
1882 if ( state==1 ) {
1883 OP( convert ) = nodetype;
1884 str=STRING(convert);
1885 STR_LEN(convert)=0;
1886 }
1887 *str++=*ch;
1888 STR_LEN(convert)++;
a3621e74 1889
8e11feef 1890 } else {
f9049ba1 1891#ifdef DEBUGGING
8e11feef
RGS
1892 if (state>1)
1893 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1894#endif
8e11feef
RGS
1895 break;
1896 }
1897 }
3dab1dad 1898 if (str) {
8e11feef 1899 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1900 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1901 trie->startstate = state;
07be1b83
YO
1902 trie->minlen -= (state - 1);
1903 trie->maxlen -= (state - 1);
1904 DEBUG_r({
1905 regnode *fix = convert;
1906 mjd_nodelen++;
1907 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1908 while( ++fix < n ) {
1909 Set_Node_Offset_Length(fix, 0, 0);
1910 }
1911 });
8e11feef
RGS
1912 if (trie->maxlen) {
1913 convert = n;
1914 } else {
3dab1dad 1915 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1916 DEBUG_r(optimize= n);
3dab1dad
YO
1917 }
1918 }
1919 }
a5ca303d
YO
1920 if (!jumper)
1921 jumper = last;
3dab1dad 1922 if ( trie->maxlen ) {
8e11feef
RGS
1923 NEXT_OFF( convert ) = (U16)(tail - convert);
1924 ARG_SET( convert, data_slot );
786e8c11
YO
1925 /* Store the offset to the first unabsorbed branch in
1926 jump[0], which is otherwise unused by the jump logic.
1927 We use this when dumping a trie and during optimisation. */
1928 if (trie->jump)
7f69552c 1929 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1930
786e8c11
YO
1931 /* XXXX */
1932 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1933 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1934 {
1935 OP( convert ) = TRIEC;
1936 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1937 Safefree(trie->bitmap);
1938 trie->bitmap= NULL;
1939 } else
1940 OP( convert ) = TRIE;
a3621e74 1941
3dab1dad
YO
1942 /* store the type in the flags */
1943 convert->flags = nodetype;
a5ca303d
YO
1944 DEBUG_r({
1945 optimize = convert
1946 + NODE_STEP_REGNODE
1947 + regarglen[ OP( convert ) ];
1948 });
1949 /* XXX We really should free up the resource in trie now,
1950 as we won't use them - (which resources?) dmq */
3dab1dad 1951 }
a3621e74 1952 /* needed for dumping*/
e62cc96a 1953 DEBUG_r(if (optimize) {
07be1b83 1954 regnode *opt = convert;
e62cc96a 1955 while ( ++opt < optimize) {
07be1b83
YO
1956 Set_Node_Offset_Length(opt,0,0);
1957 }
786e8c11
YO
1958 /*
1959 Try to clean up some of the debris left after the
1960 optimisation.
a3621e74 1961 */
786e8c11 1962 while( optimize < jumper ) {
07be1b83 1963 mjd_nodelen += Node_Length((optimize));
a3621e74 1964 OP( optimize ) = OPTIMIZED;
07be1b83 1965 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1966 optimize++;
1967 }
07be1b83 1968 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1969 });
1970 } /* end node insert */
07be1b83 1971#ifndef DEBUGGING
6e8b4190 1972 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1973#endif
786e8c11
YO
1974 return trie->jump
1975 ? MADE_JUMP_TRIE
1976 : trie->startstate>1
1977 ? MADE_EXACT_TRIE
1978 : MADE_TRIE;
1979}
1980
1981STATIC void
1982S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1983{
1984/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1985
1986 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1987 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1988 ISBN 0-201-10088-6
1989
1990 We find the fail state for each state in the trie, this state is the longest proper
1991 suffix of the current states 'word' that is also a proper prefix of another word in our
1992 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1993 the DFA not to have to restart after its tried and failed a word at a given point, it
1994 simply continues as though it had been matching the other word in the first place.
1995 Consider
1996 'abcdgu'=~/abcdefg|cdgu/
1997 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1998 fail, which would bring use to the state representing 'd' in the second word where we would
1999 try 'g' and succeed, prodceding to match 'cdgu'.
2000 */
2001 /* add a fail transition */
2002 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2003 U32 *q;
2004 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2005 const U32 numstates = trie->statecount;
786e8c11
YO
2006 const U32 ubound = trie->lasttrans + ucharcount;
2007 U32 q_read = 0;
2008 U32 q_write = 0;
2009 U32 charid;
2010 U32 base = trie->states[ 1 ].trans.base;
2011 U32 *fail;
2012 reg_ac_data *aho;
2013 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2014 GET_RE_DEBUG_FLAGS_DECL;
2015#ifndef DEBUGGING
2016 PERL_UNUSED_ARG(depth);
2017#endif
2018
2019
2020 ARG_SET( stclass, data_slot );
2021 Newxz( aho, 1, reg_ac_data );
2022 RExC_rx->data->data[ data_slot ] = (void*)aho;
2023 aho->trie=trie;
2024 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2025 numstates * sizeof(reg_trie_state));
786e8c11
YO
2026 Newxz( q, numstates, U32);
2027 Newxz( aho->fail, numstates, U32 );
2028 aho->refcount = 1;
2029 fail = aho->fail;
2030 /* initialize fail[0..1] to be 1 so that we always have
2031 a valid final fail state */
2032 fail[ 0 ] = fail[ 1 ] = 1;
2033
2034 for ( charid = 0; charid < ucharcount ; charid++ ) {
2035 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2036 if ( newstate ) {
2037 q[ q_write ] = newstate;
2038 /* set to point at the root */
2039 fail[ q[ q_write++ ] ]=1;
2040 }
2041 }
2042 while ( q_read < q_write) {
2043 const U32 cur = q[ q_read++ % numstates ];
2044 base = trie->states[ cur ].trans.base;
2045
2046 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2047 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2048 if (ch_state) {
2049 U32 fail_state = cur;
2050 U32 fail_base;
2051 do {
2052 fail_state = fail[ fail_state ];
2053 fail_base = aho->states[ fail_state ].trans.base;
2054 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2055
2056 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2057 fail[ ch_state ] = fail_state;
2058 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2059 {
2060 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2061 }
2062 q[ q_write++ % numstates] = ch_state;
2063 }
2064 }
2065 }
2066 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2067 when we fail in state 1, this allows us to use the
2068 charclass scan to find a valid start char. This is based on the principle
2069 that theres a good chance the string being searched contains lots of stuff
2070 that cant be a start char.
2071 */
2072 fail[ 0 ] = fail[ 1 ] = 0;
2073 DEBUG_TRIE_COMPILE_r({
1e2e3d02
YO
2074 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable (%"UVuf" states): 0",
2075 (int)(depth * 2), "", numstates
2076 );
786e8c11
YO
2077 for( q_read=1; q_read<numstates; q_read++ ) {
2078 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2079 }
2080 PerlIO_printf(Perl_debug_log, "\n");
2081 });
2082 Safefree(q);
2083 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2084}
2085
786e8c11 2086
a3621e74 2087/*
5d1c421c
JH
2088 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2089 * These need to be revisited when a newer toolchain becomes available.
2090 */
2091#if defined(__sparc64__) && defined(__GNUC__)
2092# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2093# undef SPARC64_GCC_WORKAROUND
2094# define SPARC64_GCC_WORKAROUND 1
2095# endif
2096#endif
2097
07be1b83
YO
2098#define DEBUG_PEEP(str,scan,depth) \
2099 DEBUG_OPTIMISE_r({ \
2100 SV * const mysv=sv_newmortal(); \
2101 regnode *Next = regnext(scan); \
2102 regprop(RExC_rx, mysv, scan); \
7f69552c 2103 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2104 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2105 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2106 });
2107
1de06328
YO
2108
2109
2110
2111
07be1b83
YO
2112#define JOIN_EXACT(scan,min,flags) \
2113 if (PL_regkind[OP(scan)] == EXACT) \
2114 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2115
be8e71aa 2116STATIC U32
07be1b83
YO
2117S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2118 /* Merge several consecutive EXACTish nodes into one. */
2119 regnode *n = regnext(scan);
2120 U32 stringok = 1;
2121 regnode *next = scan + NODE_SZ_STR(scan);
2122 U32 merged = 0;
2123 U32 stopnow = 0;
2124#ifdef DEBUGGING
2125 regnode *stop = scan;
72f13be8 2126 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2127#else
d47053eb
RGS
2128 PERL_UNUSED_ARG(depth);
2129#endif
2130#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2131 PERL_UNUSED_ARG(flags);
2132 PERL_UNUSED_ARG(val);
07be1b83 2133#endif
07be1b83
YO
2134 DEBUG_PEEP("join",scan,depth);
2135
2136 /* Skip NOTHING, merge EXACT*. */
2137 while (n &&
2138 ( PL_regkind[OP(n)] == NOTHING ||
2139 (stringok && (OP(n) == OP(scan))))
2140 && NEXT_OFF(n)
2141 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2142
2143 if (OP(n) == TAIL || n > next)
2144 stringok = 0;
2145 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2146 DEBUG_PEEP("skip:",n,depth);
2147 NEXT_OFF(scan) += NEXT_OFF(n);
2148 next = n + NODE_STEP_REGNODE;
2149#ifdef DEBUGGING
2150 if (stringok)
2151 stop = n;
2152#endif
2153 n = regnext(n);
2154 }
2155 else if (stringok) {
786e8c11 2156 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2157 regnode * const nnext = regnext(n);
2158
2159 DEBUG_PEEP("merg",n,depth);
2160
2161 merged++;
2162 if (oldl + STR_LEN(n) > U8_MAX)
2163 break;
2164 NEXT_OFF(scan) += NEXT_OFF(n);
2165 STR_LEN(scan) += STR_LEN(n);
2166 next = n + NODE_SZ_STR(n);
2167 /* Now we can overwrite *n : */
2168 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2169#ifdef DEBUGGING
2170 stop = next - 1;
2171#endif
2172 n = nnext;
2173 if (stopnow) break;
2174 }
2175
d47053eb
RGS
2176#ifdef EXPERIMENTAL_INPLACESCAN
2177 if (flags && !NEXT_OFF(n)) {
2178 DEBUG_PEEP("atch", val, depth);
2179 if (reg_off_by_arg[OP(n)]) {
2180 ARG_SET(n, val - n);
2181 }
2182 else {
2183 NEXT_OFF(n) = val - n;
2184 }
2185 stopnow = 1;
2186 }
07be1b83
YO
2187#endif
2188 }
2189
2190 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2191 /*
2192 Two problematic code points in Unicode casefolding of EXACT nodes:
2193
2194 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2195 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2196
2197 which casefold to
2198
2199 Unicode UTF-8
2200
2201 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2202 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2203
2204 This means that in case-insensitive matching (or "loose matching",
2205 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2206 length of the above casefolded versions) can match a target string
2207 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2208 This would rather mess up the minimum length computation.
2209
2210 What we'll do is to look for the tail four bytes, and then peek
2211 at the preceding two bytes to see whether we need to decrease
2212 the minimum length by four (six minus two).
2213
2214 Thanks to the design of UTF-8, there cannot be false matches:
2215 A sequence of valid UTF-8 bytes cannot be a subsequence of
2216 another valid sequence of UTF-8 bytes.
2217
2218 */
2219 char * const s0 = STRING(scan), *s, *t;
2220 char * const s1 = s0 + STR_LEN(scan) - 1;
2221 char * const s2 = s1 - 4;
e294cc5d
JH
2222#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2223 const char t0[] = "\xaf\x49\xaf\x42";
2224#else
07be1b83 2225 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2226#endif
07be1b83
YO
2227 const char * const t1 = t0 + 3;
2228
2229 for (s = s0 + 2;
2230 s < s2 && (t = ninstr(s, s1, t0, t1));
2231 s = t + 4) {
e294cc5d
JH
2232#ifdef EBCDIC
2233 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2234 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2235#else
07be1b83
YO
2236 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2237 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2238#endif
07be1b83
YO
2239 *min -= 4;
2240 }
2241 }
2242
2243#ifdef DEBUGGING
2244 /* Allow dumping */
2245 n = scan + NODE_SZ_STR(scan);
2246 while (n <= stop) {
2247 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2248 OP(n) = OPTIMIZED;
2249 NEXT_OFF(n) = 0;
2250 }
2251 n++;
2252 }
2253#endif
2254 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2255 return stopnow;
2256}
2257
653099ff
GS
2258/* REx optimizer. Converts nodes into quickier variants "in place".
2259 Finds fixed substrings. */
2260
a0288114 2261/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2262 to the position after last scanned or to NULL. */
2263
07be1b83
YO
2264
2265
76e3520e 2266STATIC I32
1de06328
YO
2267S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2268 I32 *minlenp, I32 *deltap,
9a957fbc 2269 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2270 /* scanp: Start here (read-write). */
2271 /* deltap: Write maxlen-minlen here. */
2272 /* last: Stop before this one. */
2273{
97aff369 2274 dVAR;
c277df42
IZ
2275 I32 min = 0, pars = 0, code;
2276 regnode *scan = *scanp, *next;
2277 I32 delta = 0;
2278 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2279 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2280 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2281 scan_data_t data_fake;
653099ff 2282 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2283 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2284 regnode *first_non_open = scan;
2285
a3621e74
YO
2286
2287 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2288#ifdef DEBUGGING
2289 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2290#endif
786e8c11
YO
2291 if ( depth == 0 ) {
2292 while (first_non_open && OP(first_non_open) == OPEN)
2293 first_non_open=regnext(first_non_open);
2294 }
2295
b81d288d 2296
c277df42
IZ
2297 while (scan && OP(scan) != END && scan < last) {
2298 /* Peephole optimizer: */
1de06328 2299 DEBUG_STUDYDATA(data,depth);
07be1b83 2300 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2301 JOIN_EXACT(scan,&min,0);
a3621e74 2302
653099ff
GS
2303 /* Follow the next-chain of the current node and optimize
2304 away all the NOTHINGs from it. */
c277df42 2305 if (OP(scan) != CURLYX) {
a3b680e6 2306 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2307 ? I32_MAX
2308 /* I32 may be smaller than U16 on CRAYs! */
2309 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2310 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2311 int noff;
2312 regnode *n = scan;
b81d288d 2313
c277df42
IZ
2314 /* Skip NOTHING and LONGJMP. */
2315 while ((n = regnext(n))
3dab1dad 2316 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2317 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2318 && off + noff < max)
2319 off += noff;
2320 if (reg_off_by_arg[OP(scan)])
2321 ARG(scan) = off;
b81d288d 2322 else
c277df42
IZ
2323 NEXT_OFF(scan) = off;
2324 }
a3621e74 2325
07be1b83 2326
3dab1dad 2327
653099ff
GS
2328 /* The principal pseudo-switch. Cannot be a switch, since we
2329 look into several different things. */
b81d288d 2330 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2331 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2332 next = regnext(scan);
2333 code = OP(scan);
a3621e74 2334 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2335
2336 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2337 /* NOTE - There is similar code to this block below for handling
2338 TRIE nodes on a re-study. If you change stuff here check there
2339 too. */
c277df42 2340 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2341 struct regnode_charclass_class accum;
d4c19fe8 2342 regnode * const startbranch=scan;
c277df42 2343
653099ff 2344 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2345 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2346 if (flags & SCF_DO_STCLASS)
830247a4 2347 cl_init_zero(pRExC_state, &accum);
a3621e74 2348
c277df42 2349 while (OP(scan) == code) {
830247a4 2350 I32 deltanext, minnext, f = 0, fake;
653099ff 2351 struct regnode_charclass_class this_class;
c277df42
IZ
2352
2353 num++;
2354 data_fake.flags = 0;
b81d288d 2355 if (data) {
2c2d71f5 2356 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2357 data_fake.last_closep = data->last_closep;
2358 }
2359 else
2360 data_fake.last_closep = &fake;
c277df42
IZ
2361 next = regnext(scan);
2362 scan = NEXTOPER(scan);
2363 if (code != BRANCH)
2364 scan = NEXTOPER(scan);
653099ff 2365 if (flags & SCF_DO_STCLASS) {
830247a4 2366 cl_init(pRExC_state, &this_class);
653099ff
GS
2367 data_fake.start_class = &this_class;
2368 f = SCF_DO_STCLASS_AND;
b81d288d 2369 }
e1901655
IZ
2370 if (flags & SCF_WHILEM_VISITED_POS)
2371 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2372
653099ff 2373 /* we suppose the run is continuous, last=next...*/
1de06328 2374 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
a3621e74 2375 next, &data_fake, f,depth+1);
b81d288d 2376 if (min1 > minnext)
c277df42
IZ
2377 min1 = minnext;
2378 if (max1 < minnext + deltanext)
2379 max1 = minnext + deltanext;
2380 if (deltanext == I32_MAX)
aca2d497 2381 is_inf = is_inf_internal = 1;
c277df42
IZ
2382 scan = next;
2383 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2384 pars++;
3dab1dad
YO
2385 if (data) {
2386 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2387 data->flags |= SF_HAS_EVAL;
2c2d71f5 2388 data->whilem_c = data_fake.whilem_c;
3dab1dad 2389 }
653099ff 2390 if (flags & SCF_DO_STCLASS)
830247a4 2391 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2392 if (code == SUSPEND)
c277df42
IZ
2393 break;
2394 }
2395 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2396 min1 = 0;
2397 if (flags & SCF_DO_SUBSTR) {
2398 data->pos_min += min1;
2399 data->pos_delta += max1 - min1;
2400 if (max1 != min1 || is_inf)
2401 data->longest = &(data->longest_float);
2402 }
2403 min += min1;
2404 delta += max1 - min1;
653099ff 2405 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2406 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2407 if (min1) {
2408 cl_and(data->start_class, &and_with);
2409 flags &= ~SCF_DO_STCLASS;
2410 }
2411 }
2412 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2413 if (min1) {
2414 cl_and(data->start_class, &accum);
653099ff 2415 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2416 }
2417 else {
b81d288d 2418 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2419 * data->start_class */
2420 StructCopy(data->start_class, &and_with,
2421 struct regnode_charclass_class);
2422 flags &= ~SCF_DO_STCLASS_AND;
2423 StructCopy(&accum, data->start_class,
2424 struct regnode_charclass_class);
2425 flags |= SCF_DO_STCLASS_OR;
2426 data->start_class->flags |= ANYOF_EOS;
2427 }
653099ff 2428 }
a3621e74 2429
786e8c11 2430 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2431 /* demq.
2432
2433 Assuming this was/is a branch we are dealing with: 'scan' now
2434 points at the item that follows the branch sequence, whatever
2435 it is. We now start at the beginning of the sequence and look
2436 for subsequences of
2437
786e8c11
YO
2438 BRANCH->EXACT=>x1
2439 BRANCH->EXACT=>x2
2440 tail
a3621e74
YO
2441
2442 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2443
2444 If we can find such a subseqence we need to turn the first
2445 element into a trie and then add the subsequent branch exact
2446 strings to the trie.
2447
2448 We have two cases
2449
786e8c11 2450 1. patterns where the whole set of branch can be converted.
a3621e74 2451
786e8c11 2452 2. patterns where only a subset can be converted.
a3621e74
YO
2453
2454 In case 1 we can replace the whole set with a single regop
2455 for the trie. In case 2 we need to keep the start and end
2456 branchs so
2457
2458 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2459 becomes BRANCH TRIE; BRANCH X;
2460
786e8c11
YO
2461 There is an additional case, that being where there is a
2462 common prefix, which gets split out into an EXACT like node
2463 preceding the TRIE node.
2464
2465 If x(1..n)==tail then we can do a simple trie, if not we make
2466 a "jump" trie, such that when we match the appropriate word
2467 we "jump" to the appopriate tail node. Essentailly we turn
2468 a nested if into a case structure of sorts.
a3621e74
YO
2469
2470 */
786e8c11 2471
3dab1dad 2472 int made=0;
0111c4fd
RGS
2473 if (!re_trie_maxbuff) {
2474 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2475 if (!SvIOK(re_trie_maxbuff))
2476 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2477 }
786e8c11 2478 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2479 regnode *cur;
2480 regnode *first = (regnode *)NULL;
2481 regnode *last = (regnode *)NULL;
2482 regnode *tail = scan;
2483 U8 optype = 0;
2484 U32 count=0;
2485
2486#ifdef DEBUGGING
c445ea15 2487 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2488#endif
2489 /* var tail is used because there may be a TAIL
2490 regop in the way. Ie, the exacts will point to the
2491 thing following the TAIL, but the last branch will
2492 point at the TAIL. So we advance tail. If we
2493 have nested (?:) we may have to move through several
2494 tails.
2495 */
2496
2497 while ( OP( tail ) == TAIL ) {
2498 /* this is the TAIL generated by (?:) */
2499 tail = regnext( tail );
2500 }
2501
3dab1dad 2502
a3621e74 2503 DEBUG_OPTIMISE_r({
32fc9b6a 2504 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2505 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2506 (int)depth * 2 + 2, "",
2507 "Looking for TRIE'able sequences. Tail node is: ",
2508 SvPV_nolen_const( mysv )
a3621e74
YO
2509 );
2510 });
3dab1dad 2511
a3621e74
YO
2512 /*
2513
2514 step through the branches, cur represents each
2515 branch, noper is the first thing to be matched
2516 as part of that branch and noper_next is the
2517 regnext() of that node. if noper is an EXACT
2518 and noper_next is the same as scan (our current
2519 position in the regex) then the EXACT branch is
2520 a possible optimization target. Once we have
2521 two or more consequetive such branches we can
2522 create a trie of the EXACT's contents and stich
2523 it in place. If the sequence represents all of
2524 the branches we eliminate the whole thing and
2525 replace it with a single TRIE. If it is a
2526 subsequence then we need to stitch it in. This
2527 means the first branch has to remain, and needs
2528 to be repointed at the item on the branch chain
2529 following the last branch optimized. This could
2530 be either a BRANCH, in which case the
2531 subsequence is internal, or it could be the
2532 item following the branch sequence in which
2533 case the subsequence is at the end.
2534
2535 */
2536
2537 /* dont use tail as the end marker for this traverse */
2538 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2539 regnode * const noper = NEXTOPER( cur );
be981c67 2540#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2541 regnode * const noper_next = regnext( noper );
be981c67 2542#endif
a3621e74 2543
a3621e74 2544 DEBUG_OPTIMISE_r({
32fc9b6a 2545 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2546 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2547 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2548
32fc9b6a 2549 regprop(RExC_rx, mysv, noper);
a3621e74 2550 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2551 SvPV_nolen_const(mysv));
a3621e74
YO
2552
2553 if ( noper_next ) {
32fc9b6a 2554 regprop(RExC_rx, mysv, noper_next );
a3621e74 2555 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2556 SvPV_nolen_const(mysv));
a3621e74 2557 }
3dab1dad
YO
2558 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2559 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2560 });
3dab1dad
YO
2561 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2562 : PL_regkind[ OP( noper ) ] == EXACT )
2563 || OP(noper) == NOTHING )
786e8c11
YO
2564#ifdef NOJUMPTRIE
2565 && noper_next == tail
2566#endif
2567 && count < U16_MAX)
a3621e74
YO
2568 {
2569 count++;
3dab1dad
YO
2570 if ( !first || optype == NOTHING ) {
2571 if (!first) first = cur;
a3621e74
YO
2572 optype = OP( noper );
2573 } else {
a3621e74 2574 last = cur;
a3621e74
YO
2575 }
2576 } else {
2577 if ( last ) {
786e8c11
YO
2578 make_trie( pRExC_state,
2579 startbranch, first, cur, tail, count,
2580 optype, depth+1 );
a3621e74 2581 }
3dab1dad 2582 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2583#ifdef NOJUMPTRIE
2584 && noper_next == tail
2585#endif
2586 ){
a3621e74
YO
2587 count = 1;
2588 first = cur;
2589 optype = OP( noper );
2590 } else {
2591 count = 0;
2592 first = NULL;
2593 optype = 0;
2594 }
2595 last = NULL;
2596 }
2597 }
2598 DEBUG_OPTIMISE_r({
32fc9b6a 2599 regprop(RExC_rx, mysv, cur);
a3621e74 2600 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2601 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2602 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2603
2604 });
2605 if ( last ) {
786e8c11 2606 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2607#ifdef TRIE_STUDY_OPT
786e8c11
YO
2608 if ( ((made == MADE_EXACT_TRIE &&
2609 startbranch == first)
2610 || ( first_non_open == first )) &&
2611 depth==0 )
2612 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2613#endif
07be1b83 2614 }
a3621e74 2615 }
3dab1dad
YO
2616
2617 } /* do trie */
786e8c11 2618
a0ed51b3 2619 }
a3621e74 2620 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2621 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2622 } else /* single branch is optimized. */
c277df42
IZ
2623 scan = NEXTOPER(scan);
2624 continue;
a0ed51b3
LW
2625 }
2626 else if (OP(scan) == EXACT) {
cd439c50 2627 I32 l = STR_LEN(scan);
c445ea15 2628 UV uc;
a0ed51b3 2629 if (UTF) {
a3b680e6 2630 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2631 l = utf8_length(s, s + l);
9041c2e3 2632 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2633 } else {
2634 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2635 }
2636 min += l;
c277df42 2637 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2638 /* The code below prefers earlier match for fixed
2639 offset, later match for variable offset. */
2640 if (data->last_end == -1) { /* Update the start info. */
2641 data->last_start_min = data->pos_min;
2642 data->last_start_max = is_inf
b81d288d 2643 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2644 }
cd439c50 2645 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2646 if (UTF)
2647 SvUTF8_on(data->last_found);
0eda9292 2648 {
9a957fbc 2649 SV * const sv = data->last_found;
a28509cc 2650 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2651 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2652 if (mg && mg->mg_len >= 0)
5e43f467
JH
2653 mg->mg_len += utf8_length((U8*)STRING(scan),
2654 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2655 }
c277df42
IZ
2656 data->last_end = data->pos_min + l;
2657 data->pos_min += l; /* As in the first entry. */
2658 data->flags &= ~SF_BEFORE_EOL;
2659 }
653099ff
GS
2660 if (flags & SCF_DO_STCLASS_AND) {
2661 /* Check whether it is compatible with what we know already! */
2662 int compat = 1;
2663
1aa99e6b 2664 if (uc >= 0x100 ||
516a5887 2665 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2666 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2667 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2668 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2669 )
653099ff
GS
2670 compat = 0;
2671 ANYOF_CLASS_ZERO(data->start_class);
2672 ANYOF_BITMAP_ZERO(data->start_class);
2673 if (compat)
1aa99e6b 2674 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2675 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2676 if (uc < 0x100)
2677 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2678 }
2679 else if (flags & SCF_DO_STCLASS_OR) {
2680 /* false positive possible if the class is case-folded */
1aa99e6b 2681 if (uc < 0x100)
9b877dbb
IH
2682 ANYOF_BITMAP_SET(data->start_class, uc);
2683 else
2684 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2685 data->start_class->flags &= ~ANYOF_EOS;
2686 cl_and(data->start_class, &and_with);
2687 }
2688 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2689 }
3dab1dad 2690 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2691 I32 l = STR_LEN(scan);
1aa99e6b 2692 UV uc = *((U8*)STRING(scan));
653099ff
GS
2693
2694 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2695 if (flags & SCF_DO_SUBSTR) {
2696 assert(data);
1de06328 2697 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2698 }
a0ed51b3 2699 if (UTF) {
6136c704 2700 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2701 l = utf8_length(s, s + l);
9041c2e3 2702 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2703 }
2704 min += l;
ecaa9b9c 2705 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2706 data->pos_min += l;
653099ff
GS
2707 if (flags & SCF_DO_STCLASS_AND) {
2708 /* Check whether it is compatible with what we know already! */
2709 int compat = 1;
2710
1aa99e6b 2711 if (uc >= 0x100 ||
516a5887 2712 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2713 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2714 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2715 compat = 0;
2716 ANYOF_CLASS_ZERO(data->start_class);
2717 ANYOF_BITMAP_ZERO(data->start_class);
2718 if (compat) {
1aa99e6b 2719 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2720 data->start_class->flags &= ~ANYOF_EOS;
2721 data->start_class->flags |= ANYOF_FOLD;
2722 if (OP(scan) == EXACTFL)
2723 data->start_class->flags |= ANYOF_LOCALE;
2724 }
2725 }
2726 else if (flags & SCF_DO_STCLASS_OR) {
2727 if (data->start_class->flags & ANYOF_FOLD) {
2728 /* false positive possible if the class is case-folded.
2729 Assume that the locale settings are the same... */
1aa99e6b
IH
2730 if (uc < 0x100)
2731 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2732 data->start_class->flags &= ~ANYOF_EOS;
2733 }
2734 cl_and(data->start_class, &and_with);
2735 }
2736 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2737 }
bfed75c6 2738 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2739 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2740 I32 f = flags, pos_before = 0;
d4c19fe8 2741 regnode * const oscan = scan;
653099ff
GS
2742 struct regnode_charclass_class this_class;
2743 struct regnode_charclass_class *oclass = NULL;
727f22e3 2744 I32 next_is_eval = 0;
653099ff 2745
3dab1dad 2746 switch (PL_regkind[OP(scan)]) {
653099ff 2747 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2748 scan = NEXTOPER(scan);
2749 goto finish;
2750 case PLUS:
653099ff 2751 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2752 next = NEXTOPER(scan);
653099ff 2753 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2754 mincount = 1;
2755 maxcount = REG_INFTY;
c277df42
IZ
2756 next = regnext(scan);
2757 scan = NEXTOPER(scan);
2758 goto do_curly;
2759 }
2760 }
2761 if (flags & SCF_DO_SUBSTR)
2762 data->pos_min++;
2763 min++;
2764 /* Fall through. */
2765 case STAR:
653099ff
GS
2766 if (flags & SCF_DO_STCLASS) {
2767 mincount = 0;
b81d288d 2768 maxcount = REG_INFTY;
653099ff
GS
2769 next = regnext(scan);
2770 scan = NEXTOPER(scan);
2771 goto do_curly;
2772 }
b81d288d 2773 is_inf = is_inf_internal = 1;
c277df42
IZ
2774 scan = regnext(scan);
2775 if (flags & SCF_DO_SUBSTR) {
1de06328 2776 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2777 data->longest = &(data->longest_float);
2778 }
2779 goto optimize_curly_tail;
2780 case CURLY:
b81d288d 2781 mincount = ARG1(scan);
c277df42
IZ
2782 maxcount = ARG2(scan);
2783 next = regnext(scan);
cb434fcc
IZ
2784 if (OP(scan) == CURLYX) {
2785 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2786 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2787 }
c277df42 2788 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2789 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2790 do_curly:
2791 if (flags & SCF_DO_SUBSTR) {
1de06328 2792 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2793 pos_before = data->pos_min;
2794 }
2795 if (data) {
2796 fl = data->flags;
2797 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2798 if (is_inf)
2799 data->flags |= SF_IS_INF;
2800 }
653099ff 2801 if (flags & SCF_DO_STCLASS) {
830247a4 2802 cl_init(pRExC_state, &this_class);
653099ff
GS
2803 oclass = data->start_class;
2804 data->start_class = &this_class;
2805 f |= SCF_DO_STCLASS_AND;
2806 f &= ~SCF_DO_STCLASS_OR;
2807 }
e1901655
IZ
2808 /* These are the cases when once a subexpression
2809 fails at a particular position, it cannot succeed
2810 even after backtracking at the enclosing scope.
b81d288d 2811
e1901655
IZ
2812 XXXX what if minimal match and we are at the
2813 initial run of {n,m}? */
2814 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2815 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2816
c277df42 2817 /* This will finish on WHILEM, setting scan, or on NULL: */
1de06328 2818 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
a3621e74
YO
2819 (mincount == 0
2820 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2821
2822 if (flags & SCF_DO_STCLASS)
2823 data->start_class = oclass;
2824 if (mincount == 0 || minnext == 0) {
2825 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2826 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2827 }
2828 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2829 /* Switch to OR mode: cache the old value of
653099ff
GS
2830 * data->start_class */
2831 StructCopy(data->start_class, &and_with,
2832 struct regnode_charclass_class);
2833 flags &= ~SCF_DO_STCLASS_AND;
2834 StructCopy(&this_class, data->start_class,
2835 struct regnode_charclass_class);
2836 flags |= SCF_DO_STCLASS_OR;
2837 data->start_class->flags |= ANYOF_EOS;
2838 }
2839 } else { /* Non-zero len */
2840 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2841 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2842 cl_and(data->start_class, &and_with);
2843 }
2844 else if (flags & SCF_DO_STCLASS_AND)
2845 cl_and(data->start_class, &this_class);
2846 flags &= ~SCF_DO_STCLASS;
2847 }
c277df42
IZ
2848 if (!scan) /* It was not CURLYX, but CURLY. */
2849 scan = next;
041457d9
DM
2850 if ( /* ? quantifier ok, except for (?{ ... }) */
2851 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2852 && (minnext == 0) && (deltanext == 0)
99799961 2853 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2854 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2855 && ckWARN(WARN_REGEXP))
b45f050a 2856 {
830247a4 2857 vWARN(RExC_parse,
b45f050a
JF
2858 "Quantifier unexpected on zero-length expression");
2859 }
2860
c277df42 2861 min += minnext * mincount;
b81d288d 2862 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2863 && (minnext + deltanext) > 0)
2864 || deltanext == I32_MAX);
aca2d497 2865 is_inf |= is_inf_internal;
c277df42
IZ
2866 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2867
2868 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2869 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2870 && data->flags & SF_IN_PAR
2871 && !(data->flags & SF_HAS_EVAL)
2872 && !deltanext && minnext == 1 ) {
2873 /* Try to optimize to CURLYN. */
2874 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2875 regnode * const nxt1 = nxt;
497b47a8
JH
2876#ifdef DEBUGGING
2877 regnode *nxt2;
2878#endif
c277df42
IZ
2879
2880 /* Skip open. */
2881 nxt = regnext(nxt);
bfed75c6 2882 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2883 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2884 && STR_LEN(nxt) == 1))
c277df42 2885 goto nogo;
497b47a8 2886#ifdef DEBUGGING
c277df42 2887 nxt2 = nxt;
497b47a8 2888#endif
c277df42 2889 nxt = regnext(nxt);
b81d288d 2890 if (OP(nxt) != CLOSE)
c277df42
IZ
2891 goto nogo;
2892 /* Now we know that nxt2 is the only contents: */
eb160463 2893 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2894 OP(oscan) = CURLYN;
2895 OP(nxt1) = NOTHING; /* was OPEN. */
2896#ifdef DEBUGGING
2897 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2898 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2899 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2900 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2901 OP(nxt + 1) = OPTIMIZED; /* was count. */
2902 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2903#endif
c277df42 2904 }
c277df42
IZ
2905 nogo:
2906
2907 /* Try optimization CURLYX => CURLYM. */
b81d288d 2908 if ( OP(oscan) == CURLYX && data
c277df42 2909 && !(data->flags & SF_HAS_PAR)
c277df42 2910 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2911 && !deltanext /* atom is fixed width */
2912 && minnext != 0 /* CURLYM can't handle zero width */
2913 ) {
c277df42
IZ
2914 /* XXXX How to optimize if data == 0? */
2915 /* Optimize to a simpler form. */
2916 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2917 regnode *nxt2;
2918
2919 OP(oscan) = CURLYM;
2920 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2921 && (OP(nxt2) != WHILEM))
c277df42
IZ
2922 nxt = nxt2;
2923 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2924 /* Need to optimize away parenths. */
2925 if (data->flags & SF_IN_PAR) {
2926 /* Set the parenth number. */
2927 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2928
b81d288d 2929 if (OP(nxt) != CLOSE)
b45f050a 2930 FAIL("Panic opt close");
eb160463 2931 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2932 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2933 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2934#ifdef DEBUGGING
2935 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2936 OP(nxt + 1) = OPTIMIZED; /* was count. */
2937 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2938 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2939#endif
c277df42
IZ
2940#if 0
2941 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2942 regnode *nnxt = regnext(nxt1);
b81d288d 2943
c277df42
IZ
2944 if (nnxt == nxt) {
2945 if (reg_off_by_arg[OP(nxt1)])
2946 ARG_SET(nxt1, nxt2 - nxt1);
2947 else if (nxt2 - nxt1 < U16_MAX)
2948 NEXT_OFF(nxt1) = nxt2 - nxt1;
2949 else
2950 OP(nxt) = NOTHING; /* Cannot beautify */
2951 }
2952 nxt1 = nnxt;
2953 }
2954#endif
2955 /* Optimize again: */
1de06328 2956 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
a3621e74 2957 NULL, 0,depth+1);
a0ed51b3
LW
2958 }
2959 else
c277df42 2960 oscan->flags = 0;
c277df42 2961 }
e1901655
IZ
2962 else if ((OP(oscan) == CURLYX)
2963 && (flags & SCF_WHILEM_VISITED_POS)
2964 /* See the comment on a similar expression above.
2965 However, this time it not a subexpression
2966 we care about, but the expression itself. */
2967 && (maxcount == REG_INFTY)
2968 && data && ++data->whilem_c < 16) {
2969 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2970 /* Find WHILEM (as in regexec.c) */
2971 regnode *nxt = oscan + NEXT_OFF(oscan);
2972
2973 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2974 nxt += ARG(nxt);
eb160463
GS
2975 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2976 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2977 }
b81d288d 2978 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2979 pars++;
2980 if (flags & SCF_DO_SUBSTR) {
c445ea15 2981 SV *last_str = NULL;
c277df42
IZ
2982 int counted = mincount != 0;
2983
2984 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2985#if defined(SPARC64_GCC_WORKAROUND)
2986 I32 b = 0;
2987 STRLEN l = 0;
cfd0369c 2988 const char *s = NULL;
5d1c421c
JH
2989 I32 old = 0;
2990
2991 if (pos_before >= data->last_start_min)
2992 b = pos_before;
2993 else
2994 b = data->last_start_min;
2995
2996 l = 0;
cfd0369c 2997 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2998 old = b - data->last_start_min;
2999
3000#else
b81d288d 3001 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3002 ? pos_before : data->last_start_min;
3003 STRLEN l;
d4c19fe8 3004 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3005 I32 old = b - data->last_start_min;
5d1c421c 3006#endif
a0ed51b3
LW
3007
3008 if (UTF)
3009 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3010
a0ed51b3 3011 l -= old;
c277df42 3012 /* Get the added string: */
79cb57f6 3013 last_str = newSVpvn(s + old, l);
0e933229
IH
3014 if (UTF)
3015 SvUTF8_on(last_str);
c277df42
IZ
3016 if (deltanext == 0 && pos_before == b) {
3017 /* What was added is a constant string */
3018 if (mincount > 1) {
3019 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3020 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3021 SvPVX_const(last_str), l, mincount - 1);
b162af07 3022 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3023 /* Add additional parts. */
b81d288d 3024 SvCUR_set(data->last_found,
c277df42
IZ
3025 SvCUR(data->last_found) - l);
3026 sv_catsv(data->last_found, last_str);
0eda9292
JH
3027 {
3028 SV * sv = data->last_found;
3029 MAGIC *mg =
3030 SvUTF8(sv) && SvMAGICAL(sv) ?
3031 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3032 if (mg && mg->mg_len >= 0)
3033 mg->mg_len += CHR_SVLEN(last_str);
3034 }
c277df42
IZ
3035 data->last_end += l * (mincount - 1);
3036 }
2a8d9689
HS
3037 } else {
3038 /* start offset must point into the last copy */
3039 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3040 data->last_start_max += is_inf ? I32_MAX
3041 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3042 }
3043 }
3044 /* It is counted once already... */
3045 data->pos_min += minnext * (mincount - counted);
3046 data->pos_delta += - counted * deltanext +
3047 (minnext + deltanext) * maxcount - minnext * mincount;
3048 if (mincount != maxcount) {
653099ff
GS
3049 /* Cannot extend fixed substrings found inside
3050 the group. */
1de06328 3051 scan_commit(pRExC_state,data,minlenp);
c277df42 3052 if (mincount && last_str) {
d4c19fe8
AL
3053 SV * const sv = data->last_found;
3054 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3055 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3056
3057 if (mg)
3058 mg->mg_len = -1;
3059 sv_setsv(sv, last_str);
c277df42 3060 data->last_end = data->pos_min;
b81d288d 3061 data->last_start_min =
a0ed51b3 3062 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3063 data->last_start_max = is_inf
3064 ? I32_MAX
c277df42 3065 : data->pos_min + data->pos_delta
a0ed51b3 3066 - CHR_SVLEN(last_str);
c277df42
IZ
3067 }
3068 data->longest = &(data->longest_float);
3069 }
aca2d497 3070 SvREFCNT_dec(last_str);
c277df42 3071 }
405ff068 3072 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3073 data->flags |= SF_HAS_EVAL;
3074 optimize_curly_tail:
c277df42 3075 if (OP(oscan) != CURLYX) {
3dab1dad 3076 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3077 && NEXT_OFF(next))
3078 NEXT_OFF(oscan) += NEXT_OFF(next);
3079 }
c277df42 3080 continue;
653099ff 3081 default: /* REF and CLUMP only? */
c277df42 3082 if (flags & SCF_DO_SUBSTR) {
1de06328 3083 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3084 data->longest = &(data->longest_float);
3085 }
aca2d497 3086 is_inf = is_inf_internal = 1;
653099ff 3087 if (flags & SCF_DO_STCLASS_OR)
830247a4 3088 cl_anything(pRExC_state, data->start_class);
653099ff 3089 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3090 break;
3091 }
a0ed51b3 3092 }
bfed75c6 3093 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3094 int value = 0;
653099ff 3095
c277df42 3096 if (flags & SCF_DO_SUBSTR) {
1de06328 3097 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3098 data->pos_min++;
3099 }
3100 min++;
653099ff
GS
3101 if (flags & SCF_DO_STCLASS) {
3102 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3103
3104 /* Some of the logic below assumes that switching
3105 locale on will only add false positives. */
3dab1dad 3106 switch (PL_regkind[OP(scan)]) {
653099ff 3107 case SANY:
653099ff
GS
3108 default:
3109 do_default:
3110 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3111 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3112 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3113 break;
3114 case REG_ANY:
3115 if (OP(scan) == SANY)
3116 goto do_default;
3117 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3118 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3119 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3120 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3121 }
3122 if (flags & SCF_DO_STCLASS_AND || !value)
3123 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3124 break;
3125 case ANYOF:
3126 if (flags & SCF_DO_STCLASS_AND)
3127 cl_and(data->start_class,
3128 (struct regnode_charclass_class*)scan);
3129 else
830247a4 3130 cl_or(pRExC_state, data->start_class,
653099ff
GS
3131 (struct regnode_charclass_class*)scan);
3132 break;
3133 case ALNUM:
3134 if (flags & SCF_DO_STCLASS_AND) {
3135 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3136 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3137 for (value = 0; value < 256; value++)
3138 if (!isALNUM(value))
3139 ANYOF_BITMAP_CLEAR(data->start_class, value);
3140 }
3141 }
3142 else {
3143 if (data->start_class->flags & ANYOF_LOCALE)
3144 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3145 else {
3146 for (value = 0; value < 256; value++)
3147 if (isALNUM(value))
b81d288d 3148 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3149 }
3150 }
3151 break;
3152 case ALNUML:
3153 if (flags & SCF_DO_STCLASS_AND) {
3154 if (data->start_class->flags & ANYOF_LOCALE)
3155 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3156 }
3157 else {
3158 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3159 data->start_class->flags |= ANYOF_LOCALE;
3160 }
3161 break;
3162 case NALNUM:
3163 if (flags & SCF_DO_STCLASS_AND) {
3164 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3165 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3166 for (value = 0; value < 256; value++)
3167 if (isALNUM(value))
3168 ANYOF_BITMAP_CLEAR(data->start_class, value);
3169 }
3170 }
3171 else {
3172 if (data->start_class->flags & ANYOF_LOCALE)
3173 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3174 else {
3175 for (value = 0; value < 256; value++)
3176 if (!isALNUM(value))
b81d288d 3177 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3178 }
3179 }
3180 break;
3181 case NALNUML:
3182 if (flags & SCF_DO_STCLASS_AND) {
3183 if (data->start_class->flags & ANYOF_LOCALE)
3184 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3185 }
3186 else {
3187 data->start_class->flags |= ANYOF_LOCALE;
3188 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3189 }
3190 break;
3191 case SPACE:
3192 if (flags & SCF_DO_STCLASS_AND) {
3193 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3194 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3195 for (value = 0; value < 256; value++)
3196 if (!isSPACE(value))
3197 ANYOF_BITMAP_CLEAR(data->start_class, value);
3198 }
3199 }
3200 else {
3201 if (data->start_class->flags & ANYOF_LOCALE)
3202 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3203 else {
3204 for (value = 0; value < 256; value++)
3205 if (isSPACE(value))
b81d288d 3206 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3207 }
3208 }
3209 break;
3210 case SPACEL:
3211 if (flags & SCF_DO_STCLASS_AND) {
3212 if (data->start_class->flags & ANYOF_LOCALE)
3213 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3214 }
3215 else {
3216 data->start_class->flags |= ANYOF_LOCALE;
3217 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3218 }
3219 break;
3220 case NSPACE:
3221 if (flags & SCF_DO_STCLASS_AND) {
3222 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3223 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3224 for (value = 0; value < 256; value++)
3225 if (isSPACE(value))
3226 ANYOF_BITMAP_CLEAR(data->start_class, value);
3227 }
3228 }
3229 else {
3230 if (data->start_class->flags & ANYOF_LOCALE)
3231 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3232 else {
3233 for (value = 0; value < 256; value++)
3234 if (!isSPACE(value))
b81d288d 3235 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3236 }
3237 }
3238 break;
3239 case NSPACEL:
3240 if (flags & SCF_DO_STCLASS_AND) {
3241 if (data->start_class->flags & ANYOF_LOCALE) {
3242 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3243 for (value = 0; value < 256; value++)
3244 if (!isSPACE(value))
3245 ANYOF_BITMAP_CLEAR(data->start_class, value);
3246 }
3247 }
3248 else {
3249 data->start_class->flags |= ANYOF_LOCALE;
3250 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3251 }
3252 break;
3253 case DIGIT:
3254 if (flags & SCF_DO_STCLASS_AND) {
3255 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3256 for (value = 0; value < 256; value++)
3257 if (!isDIGIT(value))
3258 ANYOF_BITMAP_CLEAR(data->start_class, value);
3259 }
3260 else {
3261 if (data->start_class->flags & ANYOF_LOCALE)
3262 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3263 else {
3264 for (value = 0; value < 256; value++)
3265 if (isDIGIT(value))
b81d288d 3266 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3267 }
3268 }
3269 break;
3270 case NDIGIT:
3271 if (flags & SCF_DO_STCLASS_AND) {
3272 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3273 for (value = 0; value < 256; value++)
3274 if (isDIGIT(value))
3275 ANYOF_BITMAP_CLEAR(data->start_class, value);
3276 }
3277 else {
3278 if (data->start_class->flags & ANYOF_LOCALE)
3279 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3280 else {
3281 for (value = 0; value < 256; value++)
3282 if (!isDIGIT(value))
b81d288d 3283 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3284 }
3285 }
3286 break;
3287 }
3288 if (flags & SCF_DO_STCLASS_OR)
3289 cl_and(data->start_class, &and_with);
3290 flags &= ~SCF_DO_STCLASS;
3291 }
a0ed51b3 3292 }
3dab1dad 3293 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3294 data->flags |= (OP(scan) == MEOL
3295 ? SF_BEFORE_MEOL
3296 : SF_BEFORE_SEOL);
a0ed51b3 3297 }
3dab1dad 3298 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3299 /* Lookbehind, or need to calculate parens/evals/stclass: */
3300 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3301 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3302 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3303 || OP(scan) == UNLESSM )
3304 {
3305 /* Negative Lookahead/lookbehind
3306 In this case we can't do fixed string optimisation.
3307 */
3308
3309 I32 deltanext, minnext, fake = 0;
3310 regnode *nscan;
3311 struct regnode_charclass_class intrnl;
3312 int f = 0;
3313
3314 data_fake.flags = 0;
3315 if (data) {
3316 data_fake.whilem_c = data->whilem_c;
3317 data_fake.last_closep = data->last_closep;
a0ed51b3 3318 }
1de06328
YO
3319 else
3320 data_fake.last_closep = &fake;
3321 if ( flags & SCF_DO_STCLASS && !scan->flags
3322 && OP(scan) == IFMATCH ) { /* Lookahead */
3323 cl_init(pRExC_state, &intrnl);
3324 data_fake.start_class = &intrnl;
3325 f |= SCF_DO_STCLASS_AND;
c277df42 3326 }
1de06328
YO
3327 if (flags & SCF_WHILEM_VISITED_POS)
3328 f |= SCF_WHILEM_VISITED_POS;
3329 next = regnext(scan);
3330 nscan = NEXTOPER(NEXTOPER(scan));
3331 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3332 if (scan->flags) {
3333 if (deltanext) {
3334 vFAIL("Variable length lookbehind not implemented");
3335 }
3336 else if (minnext > (I32)U8_MAX) {
3337 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3338 }
3339 scan->flags = (U8)minnext;
3340 }
3341 if (data) {
3342 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3343 pars++;
3344 if (data_fake.flags & SF_HAS_EVAL)
3345 data->flags |= SF_HAS_EVAL;
3346 data->whilem_c = data_fake.whilem_c;
3347 }
3348 if (f & SCF_DO_STCLASS_AND) {
3349 const int was = (data->start_class->flags & ANYOF_EOS);
3350
3351 cl_and(data->start_class, &intrnl);
3352 if (was)
3353 data->start_class->flags |= ANYOF_EOS;
3354 }
be8e71aa 3355 }
1de06328
YO
3356#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3357 else {
3358 /* Positive Lookahead/lookbehind
3359 In this case we can do fixed string optimisation,
3360 but we must be careful about it. Note in the case of
3361 lookbehind the positions will be offset by the minimum
3362 length of the pattern, something we won't know about
3363 until after the recurse.
3364 */
3365 I32 deltanext, fake = 0;
3366 regnode *nscan;
3367 struct regnode_charclass_class intrnl;
3368 int f = 0;
3369 /* We use SAVEFREEPV so that when the full compile
3370 is finished perl will clean up the allocated
3371 minlens when its all done. This was we don't
3372 have to worry about freeing them when we know
3373 they wont be used, which would be a pain.
3374 */
3375 I32 *minnextp;
3376 Newx( minnextp, 1, I32 );
3377 SAVEFREEPV(minnextp);
3378
3379 if (data) {
3380 StructCopy(data, &data_fake, scan_data_t);
3381 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3382 f |= SCF_DO_SUBSTR;
3383 if (scan->flags)
3384 scan_commit(pRExC_state, &data_fake,minlenp);
3385 data_fake.last_found=newSVsv(data->last_found);
3386 }
3387 }
3388 else
3389 data_fake.last_closep = &fake;
3390 data_fake.flags = 0;
3391 if (is_inf)
3392 data_fake.flags |= SF_IS_INF;
3393 if ( flags & SCF_DO_STCLASS && !scan->flags
3394 && OP(scan) == IFMATCH ) { /* Lookahead */
3395 cl_init(pRExC_state, &intrnl);
3396 data_fake.start_class = &intrnl;
3397 f |= SCF_DO_STCLASS_AND;
3398 }
3399 if (flags & SCF_WHILEM_VISITED_POS)
3400 f |= SCF_WHILEM_VISITED_POS;
3401 next = regnext(scan);
3402 nscan = NEXTOPER(NEXTOPER(scan));
3403
3404 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3405 if (scan->flags) {
3406 if (deltanext) {
3407 vFAIL("Variable length lookbehind not implemented");
3408 }
3409 else if (*minnextp > (I32)U8_MAX) {
3410 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3411 }
3412 scan->flags = (U8)*minnextp;
3413 }
3414
3415 *minnextp += min;
3416
3417
3418 if (f & SCF_DO_STCLASS_AND) {
3419 const int was = (data->start_class->flags & ANYOF_EOS);
3420
3421 cl_and(data->start_class, &intrnl);
3422 if (was)
3423 data->start_class->flags |= ANYOF_EOS;
3424 }
3425 if (data) {
3426 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3427 pars++;
3428 if (data_fake.flags & SF_HAS_EVAL)
3429 data->flags |= SF_HAS_EVAL;
3430 data->whilem_c = data_fake.whilem_c;
3431 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3432 if (RExC_rx->minlen<*minnextp)
3433 RExC_rx->minlen=*minnextp;
3434 scan_commit(pRExC_state, &data_fake, minnextp);
3435 SvREFCNT_dec(data_fake.last_found);
3436
3437 if ( data_fake.minlen_fixed != minlenp )
3438 {
3439 data->offset_fixed= data_fake.offset_fixed;
3440 data->minlen_fixed= data_fake.minlen_fixed;
3441 data->lookbehind_fixed+= scan->flags;
3442 }
3443 if ( data_fake.minlen_float != minlenp )
3444 {
3445 data->minlen_float= data_fake.minlen_float;
3446 data->offset_float_min=data_fake.offset_float_min;
3447 data->offset_float_max=data_fake.offset_float_max;
3448 data->lookbehind_float+= scan->flags;
3449 }
3450 }
3451 }
3452
653099ff 3453
653099ff 3454 }
1de06328 3455#endif
a0ed51b3
LW
3456 }
3457 else if (OP(scan) == OPEN) {
c277df42 3458 pars++;
a0ed51b3 3459 }
cb434fcc 3460 else if (OP(scan) == CLOSE) {
eb160463 3461 if ((I32)ARG(scan) == is_par) {
cb434fcc 3462 next = regnext(scan);
c277df42 3463
cb434fcc
IZ
3464 if ( next && (OP(next) != WHILEM) && next < last)
3465 is_par = 0; /* Disable optimization */
3466 }
3467 if (data)
3468 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3469 }
3470 else if (OP(scan) == EVAL) {
c277df42
IZ
3471 if (data)
3472 data->flags |= SF_HAS_EVAL;
3473 }
0a4db386
YO
3474 else if ( (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3475 || OP(scan)==RECURSE) /* recursion */
3476 {
3477 if (OP(scan)==RECURSE) {
3478 ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan );
3479 }
0f5d15d6 3480 if (flags & SCF_DO_SUBSTR) {
1de06328 3481 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3482 data->longest = &(data->longest_float);
3483 }
3484 is_inf = is_inf_internal = 1;
653099ff 3485 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3486 cl_anything(pRExC_state, data->start_class);
96776eda 3487 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3488 }
786e8c11
YO
3489#ifdef TRIE_STUDY_OPT
3490#ifdef FULL_TRIE_STUDY
3491 else if (PL_regkind[OP(scan)] == TRIE) {
3492 /* NOTE - There is similar code to this block above for handling
3493 BRANCH nodes on the initial study. If you change stuff here
3494 check there too. */
7f69552c 3495 regnode *trie_node= scan;
786e8c11
YO
3496 regnode *tail= regnext(scan);
3497 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3498 I32 max1 = 0, min1 = I32_MAX;
3499 struct regnode_charclass_class accum;
3500
3501 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3502 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3503 if (flags & SCF_DO_STCLASS)
3504 cl_init_zero(pRExC_state, &accum);
3505
3506 if (!trie->jump) {
3507 min1= trie->minlen;
3508 max1= trie->maxlen;
3509 } else {
3510 const regnode *nextbranch= NULL;
3511 U32 word;
3512
3513 for ( word=1 ; word <= trie->wordcount ; word++)
3514 {
3515 I32 deltanext=0, minnext=0, f = 0, fake;
3516 struct regnode_charclass_class this_class;
3517
3518 data_fake.flags = 0;
3519 if (data) {
3520 data_fake.whilem_c = data->whilem_c;
3521 data_fake.last_closep = data->last_closep;
3522 }
3523 else
3524 data_fake.last_closep = &fake;
3525
3526 if (flags & SCF_DO_STCLASS) {
3527 cl_init(pRExC_state, &this_class);
3528 data_fake.start_class = &this_class;
3529 f = SCF_DO_STCLASS_AND;
3530 }
3531 if (flags & SCF_WHILEM_VISITED_POS)
3532 f |= SCF_WHILEM_VISITED_POS;
3533
3534 if (trie->jump[word]) {
3535 if (!nextbranch)
7f69552c
YO
3536 nextbranch = trie_node + trie->jump[0];
3537 scan= trie_node + trie->jump[word];
786e8c11
YO
3538 /* We go from the jump point to the branch that follows
3539 it. Note this means we need the vestigal unused branches
3540 even though they arent otherwise used.
3541 */
1de06328 3542 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
786e8c11
YO
3543 (regnode *)nextbranch, &data_fake, f,depth+1);
3544 }
3545 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3546 nextbranch= regnext((regnode*)nextbranch);
3547
3548 if (min1 > (I32)(minnext + trie->minlen))
3549 min1 = minnext + trie->minlen;
3550 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3551 max1 = minnext + deltanext + trie->maxlen;
3552 if (deltanext == I32_MAX)
3553 is_inf = is_inf_internal = 1;
3554
3555 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3556 pars++;
3557
3558 if (data) {
3559 if (data_fake.flags & SF_HAS_EVAL)
3560 data->flags |= SF_HAS_EVAL;
3561 data->whilem_c = data_fake.whilem_c;
3562 }
3563 if (flags & SCF_DO_STCLASS)
3564 cl_or(pRExC_state, &accum, &this_class);
3565 }
3566 }
3567 if (flags & SCF_DO_SUBSTR) {
3568 data->pos_min += min1;
3569 data->pos_delta += max1 - min1;
3570 if (max1 != min1 || is_inf)
3571 data->longest = &(data->longest_float);
3572 }
3573 min += min1;
3574 delta += max1 - min1;
3575 if (flags & SCF_DO_STCLASS_OR) {
3576 cl_or(pRExC_state, data->start_class, &accum);
3577 if (min1) {
3578 cl_and(data->start_class, &and_with);
3579 flags &= ~SCF_DO_STCLASS;
3580 }
3581 }
3582 else if (flags & SCF_DO_STCLASS_AND) {
3583 if (min1) {
3584 cl_and(data->start_class, &accum);
3585 flags &= ~SCF_DO_STCLASS;
3586 }
3587 else {
3588 /* Switch to OR mode: cache the old value of
3589 * data->start_class */
3590 StructCopy(data->start_class, &and_with,
3591 struct regnode_charclass_class);
3592 flags &= ~SCF_DO_STCLASS_AND;
3593 StructCopy(&accum, data->start_class,
3594 struct regnode_charclass_class);
3595 flags |= SCF_DO_STCLASS_OR;
3596 data->start_class->flags |= ANYOF_EOS;
3597 }
3598 }
3599 scan= tail;
3600 continue;
3601 }
3602#else
3603 else if (PL_regkind[OP(scan)] == TRIE) {
3604 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3605 U8*bang=NULL;
3606
3607 min += trie->minlen;
3608 delta += (trie->maxlen - trie->minlen);
3609 flags &= ~SCF_DO_STCLASS; /* xxx */
3610 if (flags & SCF_DO_SUBSTR) {
1de06328 3611 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3612 data->pos_min += trie->minlen;
3613 data->pos_delta += (trie->maxlen - trie->minlen);
3614 if (trie->maxlen != trie->minlen)
3615 data->longest = &(data->longest_float);
3616 }
3617 if (trie->jump) /* no more substrings -- for now /grr*/
3618 flags &= ~SCF_DO_SUBSTR;
3619 }
3620#endif /* old or new */
3621#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3622 /* Else: zero-length, ignore. */
3623 scan = regnext(scan);
3624 }
3625
3626 finish:
3627 *scanp = scan;
aca2d497 3628 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3629 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3630 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3631 if (is_par > (I32)U8_MAX)
c277df42
IZ
3632 is_par = 0;
3633 if (is_par && pars==1 && data) {
3634 data->flags |= SF_IN_PAR;
3635 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3636 }
3637 else if (pars && data) {
c277df42
IZ
3638 data->flags |= SF_HAS_PAR;
3639 data->flags &= ~SF_IN_PAR;
3640 }
653099ff
GS
3641 if (flags & SCF_DO_STCLASS_OR)
3642 cl_and(data->start_class, &and_with);
786e8c11
YO
3643 if (flags & SCF_TRIE_RESTUDY)
3644 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3645
3646 DEBUG_STUDYDATA(data,depth);
3647
c277df42
IZ
3648 return min;
3649}
3650
76e3520e 3651STATIC I32
5f66b61c 3652S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3653{
830247a4 3654 if (RExC_rx->data) {
2eb97020 3655 const U32 count = RExC_rx->data->count;
b81d288d 3656 Renewc(RExC_rx->data,
2eb97020 3657 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3658 char, struct reg_data);
2eb97020 3659 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3660 RExC_rx->data->count += n;
a0ed51b3
LW
3661 }
3662 else {
a02a5408 3663 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3664 char, struct reg_data);
a02a5408 3665 Newx(RExC_rx->data->what, n, U8);
830247a4 3666 RExC_rx->data->count = n;
c277df42 3667 }
830247a4
IZ
3668 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3669 return RExC_rx->data->count - n;
c277df42
IZ
3670}
3671
76234dfb 3672#ifndef PERL_IN_XSUB_RE
d88dccdf 3673void
864dbfa3 3674Perl_reginitcolors(pTHX)
d88dccdf 3675{
97aff369 3676 dVAR;
1df70142 3677 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3678 if (s) {
1df70142
AL
3679 char *t = savepv(s);
3680 int i = 0;
3681 PL_colors[0] = t;
d88dccdf 3682 while (++i < 6) {
1df70142
AL
3683 t = strchr(t, '\t');
3684 if (t) {
3685 *t = '\0';
3686 PL_colors[i] = ++t;
d88dccdf
IZ
3687 }
3688 else
1df70142 3689 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3690 }
3691 } else {
1df70142 3692 int i = 0;
b81d288d 3693 while (i < 6)
06b5626a 3694 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3695 }
3696 PL_colorset = 1;
3697}
76234dfb 3698#endif
8615cb43 3699
07be1b83 3700
786e8c11
YO
3701#ifdef TRIE_STUDY_OPT
3702#define CHECK_RESTUDY_GOTO \
3703 if ( \
3704 (data.flags & SCF_TRIE_RESTUDY) \
3705 && ! restudied++ \
3706 ) goto reStudy
3707#else
3708#define CHECK_RESTUDY_GOTO