This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Swap _reg_ac_data.trie to U32 offset into the regdata array, as
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
40d049e4
YO
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
3dab1dad 135#ifdef DEBUGGING
be8e71aa 136 const char *lastparse;
3dab1dad
YO
137 I32 lastnum;
138#define RExC_lastparse (pRExC_state->lastparse)
139#define RExC_lastnum (pRExC_state->lastnum)
140#endif
830247a4
IZ
141} RExC_state_t;
142
e2509266 143#define RExC_flags (pRExC_state->flags)
830247a4
IZ
144#define RExC_precomp (pRExC_state->precomp)
145#define RExC_rx (pRExC_state->rx)
f8fc2ecf 146#define RExC_rxi (pRExC_state->rxi)
fac92740 147#define RExC_start (pRExC_state->start)
830247a4
IZ
148#define RExC_end (pRExC_state->end)
149#define RExC_parse (pRExC_state->parse)
150#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 151#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 152#define RExC_emit (pRExC_state->emit)
fac92740 153#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
154#define RExC_naughty (pRExC_state->naughty)
155#define RExC_sawback (pRExC_state->sawback)
156#define RExC_seen (pRExC_state->seen)
157#define RExC_size (pRExC_state->size)
158#define RExC_npar (pRExC_state->npar)
c74340f9 159#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 160#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 165#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
166#define RExC_open_parens (pRExC_state->open_parens)
167#define RExC_close_parens (pRExC_state->close_parens)
168#define RExC_opend (pRExC_state->opend)
81714fb9 169#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
170#define RExC_recurse (pRExC_state->recurse)
171#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 172
a687059c
LW
173#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
174#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
175 ((*s) == '{' && regcurly(s)))
a687059c 176
35c8bce7
LW
177#ifdef SPSTART
178#undef SPSTART /* dratted cpp namespace... */
179#endif
a687059c
LW
180/*
181 * Flags to be passed up and down.
182 */
a687059c 183#define WORST 0 /* Worst case. */
821b33a5 184#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
185#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
186#define SPSTART 0x4 /* Starts with * or +. */
187#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 188
3dab1dad
YO
189#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
190
07be1b83
YO
191/* whether trie related optimizations are enabled */
192#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
193#define TRIE_STUDY_OPT
786e8c11 194#define FULL_TRIE_STUDY
07be1b83
YO
195#define TRIE_STCLASS
196#endif
1de06328
YO
197
198
40d049e4
YO
199
200#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
201#define PBITVAL(paren) (1 << ((paren) & 7))
202#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
203#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
204#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205
206
1de06328
YO
207/* About scan_data_t.
208
209 During optimisation we recurse through the regexp program performing
210 various inplace (keyhole style) optimisations. In addition study_chunk
211 and scan_commit populate this data structure with information about
212 what strings MUST appear in the pattern. We look for the longest
213 string that must appear for at a fixed location, and we look for the
214 longest string that may appear at a floating location. So for instance
215 in the pattern:
216
217 /FOO[xX]A.*B[xX]BAR/
218
219 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
220 strings (because they follow a .* construct). study_chunk will identify
221 both FOO and BAR as being the longest fixed and floating strings respectively.
222
223 The strings can be composites, for instance
224
225 /(f)(o)(o)/
226
227 will result in a composite fixed substring 'foo'.
228
229 For each string some basic information is maintained:
230
231 - offset or min_offset
232 This is the position the string must appear at, or not before.
233 It also implicitly (when combined with minlenp) tells us how many
234 character must match before the string we are searching.
235 Likewise when combined with minlenp and the length of the string
236 tells us how many characters must appear after the string we have
237 found.
238
239 - max_offset
240 Only used for floating strings. This is the rightmost point that
241 the string can appear at. Ifset to I32 max it indicates that the
242 string can occur infinitely far to the right.
243
244 - minlenp
245 A pointer to the minimum length of the pattern that the string
246 was found inside. This is important as in the case of positive
247 lookahead or positive lookbehind we can have multiple patterns
248 involved. Consider
249
250 /(?=FOO).*F/
251
252 The minimum length of the pattern overall is 3, the minimum length
253 of the lookahead part is 3, but the minimum length of the part that
254 will actually match is 1. So 'FOO's minimum length is 3, but the
255 minimum length for the F is 1. This is important as the minimum length
256 is used to determine offsets in front of and behind the string being
257 looked for. Since strings can be composites this is the length of the
258 pattern at the time it was commited with a scan_commit. Note that
259 the length is calculated by study_chunk, so that the minimum lengths
260 are not known until the full pattern has been compiled, thus the
261 pointer to the value.
262
263 - lookbehind
264
265 In the case of lookbehind the string being searched for can be
266 offset past the start point of the final matching string.
267 If this value was just blithely removed from the min_offset it would
268 invalidate some of the calculations for how many chars must match
269 before or after (as they are derived from min_offset and minlen and
270 the length of the string being searched for).
271 When the final pattern is compiled and the data is moved from the
272 scan_data_t structure into the regexp structure the information
273 about lookbehind is factored in, with the information that would
274 have been lost precalculated in the end_shift field for the
275 associated string.
276
277 The fields pos_min and pos_delta are used to store the minimum offset
278 and the delta to the maximum offset at the current point in the pattern.
279
280*/
2c2d71f5
JH
281
282typedef struct scan_data_t {
1de06328
YO
283 /*I32 len_min; unused */
284 /*I32 len_delta; unused */
2c2d71f5
JH
285 I32 pos_min;
286 I32 pos_delta;
287 SV *last_found;
1de06328 288 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
289 I32 last_start_min;
290 I32 last_start_max;
1de06328
YO
291 SV **longest; /* Either &l_fixed, or &l_float. */
292 SV *longest_fixed; /* longest fixed string found in pattern */
293 I32 offset_fixed; /* offset where it starts */
294 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
295 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
296 SV *longest_float; /* longest floating string found in pattern */
297 I32 offset_float_min; /* earliest point in string it can appear */
298 I32 offset_float_max; /* latest point in string it can appear */
299 I32 *minlen_float; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
301 I32 flags;
302 I32 whilem_c;
cb434fcc 303 I32 *last_closep;
653099ff 304 struct regnode_charclass_class *start_class;
2c2d71f5
JH
305} scan_data_t;
306
a687059c 307/*
e50aee73 308 * Forward declarations for pregcomp()'s friends.
a687059c 309 */
a0d0e21e 310
27da23d5 311static const scan_data_t zero_scan_data =
1de06328 312 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
313
314#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
315#define SF_BEFORE_SEOL 0x0001
316#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
317#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
318#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319
09b7f37c
CB
320#ifdef NO_UNARY_PLUS
321# define SF_FIX_SHIFT_EOL (0+2)
322# define SF_FL_SHIFT_EOL (0+4)
323#else
324# define SF_FIX_SHIFT_EOL (+2)
325# define SF_FL_SHIFT_EOL (+4)
326#endif
c277df42
IZ
327
328#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
329#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
330
331#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
332#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
333#define SF_IS_INF 0x0040
334#define SF_HAS_PAR 0x0080
335#define SF_IN_PAR 0x0100
336#define SF_HAS_EVAL 0x0200
337#define SCF_DO_SUBSTR 0x0400
653099ff
GS
338#define SCF_DO_STCLASS_AND 0x0800
339#define SCF_DO_STCLASS_OR 0x1000
340#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 341#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 342
786e8c11 343#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 344#define SCF_SEEN_ACCEPT 0x8000
07be1b83 345
eb160463 346#define UTF (RExC_utf8 != 0)
bbe252da
YO
347#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
348#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 349
ffc61ed2 350#define OOB_UNICODE 12345678
93733859 351#define OOB_NAMEDCLASS -1
b8c5462f 352
a0ed51b3
LW
353#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
354#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355
8615cb43 356
b45f050a
JF
357/* length of regex to show in messages that don't mark a position within */
358#define RegexLengthToShowInErrorMessages 127
359
360/*
361 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
362 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
363 * op/pragma/warn/regcomp.
364 */
7253e4e3
RK
365#define MARKER1 "<-- HERE" /* marker as it appears in the description */
366#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 367
7253e4e3 368#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
369
370/*
371 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
372 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 * "...".
374 */
58e23c8d 375#define _FAIL(code) STMT_START { \
bfed75c6 376 const char *ellipses = ""; \
ccb2c380
MP
377 IV len = RExC_end - RExC_precomp; \
378 \
379 if (!SIZE_ONLY) \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 if (len > RegexLengthToShowInErrorMessages) { \
382 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
383 len = RegexLengthToShowInErrorMessages - 10; \
384 ellipses = "..."; \
385 } \
58e23c8d 386 code; \
ccb2c380 387} STMT_END
8615cb43 388
58e23c8d
YO
389#define FAIL(msg) _FAIL( \
390 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
391 msg, (int)len, RExC_precomp, ellipses))
392
393#define FAIL2(msg,arg) _FAIL( \
394 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
395 arg, (int)len, RExC_precomp, ellipses))
396
b45f050a 397/*
b45f050a
JF
398 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
399 */
ccb2c380 400#define Simple_vFAIL(m) STMT_START { \
a28509cc 401 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
402 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
403 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
408 */
ccb2c380
MP
409#define vFAIL(m) STMT_START { \
410 if (!SIZE_ONLY) \
411 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
412 Simple_vFAIL(m); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Like Simple_vFAIL(), but accepts two arguments.
417 */
ccb2c380 418#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 419 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
420 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
422} STMT_END
b45f050a
JF
423
424/*
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
426 */
ccb2c380
MP
427#define vFAIL2(m,a1) STMT_START { \
428 if (!SIZE_ONLY) \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
430 Simple_vFAIL2(m, a1); \
431} STMT_END
b45f050a
JF
432
433
434/*
435 * Like Simple_vFAIL(), but accepts three arguments.
436 */
ccb2c380 437#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 438 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
439 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
440 (int)offset, RExC_precomp, RExC_precomp + offset); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
445 */
ccb2c380
MP
446#define vFAIL3(m,a1,a2) STMT_START { \
447 if (!SIZE_ONLY) \
448 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
449 Simple_vFAIL3(m, a1, a2); \
450} STMT_END
b45f050a
JF
451
452/*
453 * Like Simple_vFAIL(), but accepts four arguments.
454 */
ccb2c380 455#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 456 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
b45f050a 460
ccb2c380 461#define vWARN(loc,m) STMT_START { \
a28509cc 462 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
463 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
464 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
466
467#define vWARNdep(loc,m) STMT_START { \
a28509cc 468 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
469 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
470 "%s" REPORT_LOCATION, \
471 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474
475#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 476 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
478 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
480
481#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
486
487#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
491} STMT_END
492
493#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
9d1d55b5 498
8615cb43 499
cd439c50 500/* Allow for side effects in s */
ccb2c380
MP
501#define REGC(c,s) STMT_START { \
502 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
503} STMT_END
cd439c50 504
fac92740
MJD
505/* Macros for recording node offsets. 20001227 mjd@plover.com
506 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
507 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
508 * Element 0 holds the number n.
07be1b83 509 * Position is 1 indexed.
fac92740
MJD
510 */
511
ccb2c380
MP
512#define Set_Node_Offset_To_R(node,byte) STMT_START { \
513 if (! SIZE_ONLY) { \
514 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 515 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 516 if((node) < 0) { \
551405c4 517 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
518 } else { \
519 RExC_offsets[2*(node)-1] = (byte); \
520 } \
521 } \
522} STMT_END
523
524#define Set_Node_Offset(node,byte) \
525 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
526#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
527
528#define Set_Node_Length_To_R(node,len) STMT_START { \
529 if (! SIZE_ONLY) { \
530 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 531 __LINE__, (int)(node), (int)(len))); \
ccb2c380 532 if((node) < 0) { \
551405c4 533 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
534 } else { \
535 RExC_offsets[2*(node)] = (len); \
536 } \
537 } \
538} STMT_END
539
540#define Set_Node_Length(node,len) \
541 Set_Node_Length_To_R((node)-RExC_emit_start, len)
542#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
543#define Set_Node_Cur_Length(node) \
544 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
545
546/* Get offsets and lengths */
547#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
548#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
549
07be1b83
YO
550#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
552 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
553} STMT_END
554
555
556#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
557#define EXPERIMENTAL_INPLACESCAN
558#endif
559
1de06328 560#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 561DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 562 PerlIO_printf(Perl_debug_log, \
b515a41d 563 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
564 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
565 (int)(depth)*2, "", \
566 (IV)((data)->pos_min), \
567 (IV)((data)->pos_delta), \
568 (IV)((data)->flags), \
569 (IV)((data)->whilem_c), \
570 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
571 ); \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
589 ); \
590 PerlIO_printf(Perl_debug_log,"\n"); \
591});
592
acfe0abc 593static void clear_re(pTHX_ void *r);
4327152a 594
653099ff 595/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 596 Update the longest found anchored substring and the longest found
653099ff
GS
597 floating substrings if needed. */
598
4327152a 599STATIC void
1de06328 600S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 601{
e1ec3a88
AL
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 604 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 605
c277df42 606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 607 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
b81d288d 611 data->flags
c277df42
IZ
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
613 else
614 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
a0ed51b3
LW
617 }
618 else {
c277df42 619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
620 data->offset_float_max = (l
621 ? data->last_start_max
c277df42 622 : data->pos_min + data->pos_delta);
9051bda5
HS
623 if ((U32)data->offset_float_max > (U32)I32_MAX)
624 data->offset_float_max = I32_MAX;
c277df42 625 if (data->flags & SF_BEFORE_EOL)
b81d288d 626 data->flags
c277df42
IZ
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
628 else
629 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
c277df42
IZ
632 }
633 }
634 SvCUR_set(data->last_found, 0);
0eda9292 635 {
a28509cc 636 SV * const sv = data->last_found;
097eb12c
AL
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
639 if (mg)
640 mg->mg_len = 0;
641 }
0eda9292 642 }
c277df42
IZ
643 data->last_end = -1;
644 data->flags &= ~SF_BEFORE_EOL;
1de06328 645 DEBUG_STUDYDATA(data,0);
c277df42
IZ
646}
647
653099ff
GS
648/* Can match anything (initialization) */
649STATIC void
097eb12c 650S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
653099ff 652 ANYOF_CLASS_ZERO(cl);
f8bef550 653 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* Can match anything (initialization) */
660STATIC int
5f66b61c 661S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
662{
663 int value;
664
aaa51d5e 665 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
667 return 1;
1aa99e6b
IH
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
669 return 0;
10edeb5d 670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 671 return 0;
653099ff
GS
672 return 1;
673}
674
675/* Can match anything (initialization) */
676STATIC void
097eb12c 677S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 678{
8ecf7187 679 Zero(cl, 1, struct regnode_charclass_class);
653099ff 680 cl->type = ANYOF;
830247a4 681 cl_anything(pRExC_state, cl);
653099ff
GS
682}
683
684STATIC void
097eb12c 685S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 686{
8ecf7187 687 Zero(cl, 1, struct regnode_charclass_class);
653099ff 688 cl->type = ANYOF;
830247a4 689 cl_anything(pRExC_state, cl);
653099ff
GS
690 if (LOC)
691 cl->flags |= ANYOF_LOCALE;
692}
693
694/* 'And' a given class with another one. Can create false positives */
695/* We assume that cl is not inverted */
696STATIC void
5f66b61c 697S_cl_and(struct regnode_charclass_class *cl,
a28509cc 698 const struct regnode_charclass_class *and_with)
653099ff 699{
40d049e4
YO
700
701 assert(and_with->type == ANYOF);
653099ff
GS
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
707 int i;
708
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
712 else
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
1aa99e6b 718
14ebb1a2
JH
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
724 }
14ebb1a2
JH
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 727 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
731}
732
733/* 'OR' a given class with another one. Can create false positives */
734/* We assume that cl is not inverted */
735STATIC void
097eb12c 736S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 737{
653099ff
GS
738 if (or_with->flags & ANYOF_INVERT) {
739 /* We do not use
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
747 */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
751 int i;
752
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
756 else {
830247a4 757 cl_anything(pRExC_state, cl);
653099ff
GS
758 }
759 } else {
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 762 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
763 || (cl->flags & ANYOF_FOLD)) ) {
764 int i;
765
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
773 }
774 }
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 776 cl_anything(pRExC_state, cl);
653099ff
GS
777 }
778 }
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
781
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
786 }
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
653099ff
GS
791}
792
a3621e74
YO
793#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
797
3dab1dad
YO
798
799#ifdef DEBUGGING
07be1b83 800/*
3dab1dad
YO
801 dump_trie(trie)
802 dump_trie_interim_list(trie,next_alloc)
803 dump_trie_interim_table(trie,next_alloc)
804
805 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
809
3dab1dad
YO
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
07be1b83
YO
812
813*/
3dab1dad
YO
814
815/*
816 dump_trie(trie)
817 Dumps the final compressed table form of the trie to Perl_debug_log.
818 Used for debugging make_trie().
819*/
820
821STATIC void
822S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
823{
824 U32 state;
ab3bbdeb
YO
825 SV *sv=sv_newmortal();
826 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
827 GET_RE_DEBUG_FLAGS_DECL;
828
ab3bbdeb 829
3dab1dad
YO
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
833
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 835 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 836 if ( tmp ) {
ab3bbdeb
YO
837 PerlIO_printf( Perl_debug_log, "%*s",
838 colwidth,
ddc5bc0f 839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
843 )
844 );
3dab1dad
YO
845 }
846 }
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
849
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
852 PerlIO_printf( Perl_debug_log, "\n");
853
1e2e3d02 854 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 855 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
856
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
858
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
861 } else {
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
863 }
864
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
866
867 if ( base ) {
868 U32 ofs = 0;
869
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
873 ofs++;
874
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
876
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
881 {
ab3bbdeb
YO
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
883 colwidth,
3dab1dad
YO
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
885 } else {
ab3bbdeb 886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
887 }
888 }
889
890 PerlIO_printf( Perl_debug_log, "]");
891
892 }
893 PerlIO_printf( Perl_debug_log, "\n" );
894 }
895}
896/*
897 dump_trie_interim_list(trie,next_alloc)
898 Dumps a fully constructed but uncompressed trie in list form.
899 List tries normally only are used for construction when the number of
900 possible chars (trie->uniquecharcount) is very high.
901 Used for debugging make_trie().
902*/
903STATIC void
904S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
905{
906 U32 state;
ab3bbdeb
YO
907 SV *sv=sv_newmortal();
908 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
ab3bbdeb
YO
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
913 "------:-----+-----------------\n" );
3dab1dad
YO
914
915 for( state=1 ; state < next_alloc ; state ++ ) {
916 U16 charid;
917
ab3bbdeb 918 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
922 } else {
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
925 );
926 }
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 928 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
929 if ( tmp ) {
930 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
931 colwidth,
ddc5bc0f 932 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
933 PL_colors[0], PL_colors[1],
934 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
935 PERL_PV_ESCAPE_FIRSTCHAR
936 ) ,
1e2e3d02
YO
937 TRIE_LIST_ITEM(state,charid).forid,
938 (UV)TRIE_LIST_ITEM(state,charid).newstate
939 );
940 if (!(charid % 10))
664e119d
RGS
941 PerlIO_printf(Perl_debug_log, "\n%*s| ",
942 (int)((depth * 2) + 14), "");
1e2e3d02 943 }
ab3bbdeb
YO
944 }
945 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
946 }
947}
948
949/*
950 dump_trie_interim_table(trie,next_alloc)
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
955*/
956STATIC void
957S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
958{
959 U32 state;
960 U16 charid;
ab3bbdeb
YO
961 SV *sv=sv_newmortal();
962 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
963 GET_RE_DEBUG_FLAGS_DECL;
964
965 /*
966 print out the table precompression so that we can do a visual check
967 that they are identical.
968 */
969
970 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
971
972 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 973 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 974 if ( tmp ) {
ab3bbdeb
YO
975 PerlIO_printf( Perl_debug_log, "%*s",
976 colwidth,
ddc5bc0f 977 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
978 PL_colors[0], PL_colors[1],
979 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
980 PERL_PV_ESCAPE_FIRSTCHAR
981 )
982 );
3dab1dad
YO
983 }
984 }
985
986 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
987
988 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 989 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
990 }
991
992 PerlIO_printf( Perl_debug_log, "\n" );
993
994 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
995
996 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
997 (int)depth * 2 + 2,"",
998 (UV)TRIE_NODENUM( state ) );
999
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1001 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1002 if (v)
1003 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1004 else
1005 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1006 }
1007 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1009 } else {
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1011 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1012 }
1013 }
07be1b83 1014}
3dab1dad
YO
1015
1016#endif
1017
786e8c11
YO
1018/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1019 startbranch: the first branch in the whole branch sequence
1020 first : start branch of sequence of branch-exact nodes.
1021 May be the same as startbranch
1022 last : Thing following the last branch.
1023 May be the same as tail.
1024 tail : item following the branch sequence
1025 count : words in the sequence
1026 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1027 depth : indent depth
3dab1dad 1028
786e8c11 1029Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1030
786e8c11
YO
1031A trie is an N'ary tree where the branches are determined by digital
1032decomposition of the key. IE, at the root node you look up the 1st character and
1033follow that branch repeat until you find the end of the branches. Nodes can be
1034marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1035
786e8c11 1036 /he|she|his|hers/
72f13be8 1037
786e8c11
YO
1038would convert into the following structure. Numbers represent states, letters
1039following numbers represent valid transitions on the letter from that state, if
1040the number is in square brackets it represents an accepting state, otherwise it
1041will be in parenthesis.
07be1b83 1042
786e8c11
YO
1043 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1044 | |
1045 | (2)
1046 | |
1047 (1) +-i->(6)-+-s->[7]
1048 |
1049 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1050
786e8c11
YO
1051 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1052
1053This shows that when matching against the string 'hers' we will begin at state 1
1054read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1055then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1056is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1057single traverse. We store a mapping from accepting to state to which word was
1058matched, and then when we have multiple possibilities we try to complete the
1059rest of the regex in the order in which they occured in the alternation.
1060
1061The only prior NFA like behaviour that would be changed by the TRIE support is
1062the silent ignoring of duplicate alternations which are of the form:
1063
1064 / (DUPE|DUPE) X? (?{ ... }) Y /x
1065
1066Thus EVAL blocks follwing a trie may be called a different number of times with
1067and without the optimisation. With the optimisations dupes will be silently
1068ignored. This inconsistant behaviour of EVAL type nodes is well established as
1069the following demonstrates:
1070
1071 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1072
1073which prints out 'word' three times, but
1074
1075 'words'=~/(word|word|word)(?{ print $1 })S/
1076
1077which doesnt print it out at all. This is due to other optimisations kicking in.
1078
1079Example of what happens on a structural level:
1080
1081The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1082
1083 1: CURLYM[1] {1,32767}(18)
1084 5: BRANCH(8)
1085 6: EXACT <ac>(16)
1086 8: BRANCH(11)
1087 9: EXACT <ad>(16)
1088 11: BRANCH(14)
1089 12: EXACT <ab>(16)
1090 16: SUCCEED(0)
1091 17: NOTHING(18)
1092 18: END(0)
1093
1094This would be optimizable with startbranch=5, first=5, last=16, tail=16
1095and should turn into:
1096
1097 1: CURLYM[1] {1,32767}(18)
1098 5: TRIE(16)
1099 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1100 <ac>
1101 <ad>
1102 <ab>
1103 16: SUCCEED(0)
1104 17: NOTHING(18)
1105 18: END(0)
1106
1107Cases where tail != last would be like /(?foo|bar)baz/:
1108
1109 1: BRANCH(4)
1110 2: EXACT <foo>(8)
1111 4: BRANCH(7)
1112 5: EXACT <bar>(8)
1113 7: TAIL(8)
1114 8: EXACT <baz>(10)
1115 10: END(0)
1116
1117which would be optimizable with startbranch=1, first=1, last=7, tail=8
1118and would end up looking like:
1119
1120 1: TRIE(8)
1121 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1122 <foo>
1123 <bar>
1124 7: TAIL(8)
1125 8: EXACT <baz>(10)
1126 10: END(0)
1127
1128 d = uvuni_to_utf8_flags(d, uv, 0);
1129
1130is the recommended Unicode-aware way of saying
1131
1132 *(d++) = uv;
1133*/
1134
1e2e3d02 1135#define TRIE_STORE_REVCHAR \
786e8c11 1136 STMT_START { \
1e2e3d02 1137 SV *tmp = newSVpvs(""); \
786e8c11 1138 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1139 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1140 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1141 } STMT_END
1142
1143#define TRIE_READ_CHAR STMT_START { \
1144 wordlen++; \
1145 if ( UTF ) { \
1146 if ( folder ) { \
1147 if ( foldlen > 0 ) { \
1148 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1149 foldlen -= len; \
1150 scan += len; \
1151 len = 0; \
1152 } else { \
1153 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1154 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1155 foldlen -= UNISKIP( uvc ); \
1156 scan = foldbuf + UNISKIP( uvc ); \
1157 } \
1158 } else { \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 } \
1161 } else { \
1162 uvc = (U32)*uc; \
1163 len = 1; \
1164 } \
1165} STMT_END
1166
1167
1168
1169#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1170 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1171 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1172 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1173 } \
1174 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1175 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1176 TRIE_LIST_CUR( state )++; \
1177} STMT_END
07be1b83 1178
786e8c11
YO
1179#define TRIE_LIST_NEW(state) STMT_START { \
1180 Newxz( trie->states[ state ].trans.list, \
1181 4, reg_trie_trans_le ); \
1182 TRIE_LIST_CUR( state ) = 1; \
1183 TRIE_LIST_LEN( state ) = 4; \
1184} STMT_END
07be1b83 1185
786e8c11
YO
1186#define TRIE_HANDLE_WORD(state) STMT_START { \
1187 U16 dupe= trie->states[ state ].wordnum; \
1188 regnode * const noper_next = regnext( noper ); \
1189 \
1190 if (trie->wordlen) \
1191 trie->wordlen[ curword ] = wordlen; \
1192 DEBUG_r({ \
1193 /* store the word for dumping */ \
1194 SV* tmp; \
1195 if (OP(noper) != NOTHING) \
1196 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1197 else \
1198 tmp = newSVpvn( "", 0 ); \
1199 if ( UTF ) SvUTF8_on( tmp ); \
1200 av_push( trie->words, tmp ); \
1201 }); \
1202 \
1203 curword++; \
1204 \
1205 if ( noper_next < tail ) { \
1206 if (!trie->jump) \
446bd890 1207 trie->jump = PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1208 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1209 if (!jumper) \
1210 jumper = noper_next; \
1211 if (!nextbranch) \
1212 nextbranch= regnext(cur); \
1213 } \
1214 \
1215 if ( dupe ) { \
1216 /* So it's a dupe. This means we need to maintain a */\
1217 /* linked-list from the first to the next. */\
1218 /* we only allocate the nextword buffer when there */\
1219 /* a dupe, so first time we have to do the allocation */\
1220 if (!trie->nextword) \
446bd890
NC
1221 trie->nextword = \
1222 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1223 while ( trie->nextword[dupe] ) \
1224 dupe= trie->nextword[dupe]; \
1225 trie->nextword[dupe]= curword; \
1226 } else { \
1227 /* we haven't inserted this word yet. */ \
1228 trie->states[ state ].wordnum = curword; \
1229 } \
1230} STMT_END
07be1b83 1231
3dab1dad 1232
786e8c11
YO
1233#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1234 ( ( base + charid >= ucharcount \
1235 && base + charid < ubound \
1236 && state == trie->trans[ base - ucharcount + charid ].check \
1237 && trie->trans[ base - ucharcount + charid ].next ) \
1238 ? trie->trans[ base - ucharcount + charid ].next \
1239 : ( state==1 ? special : 0 ) \
1240 )
3dab1dad 1241
786e8c11
YO
1242#define MADE_TRIE 1
1243#define MADE_JUMP_TRIE 2
1244#define MADE_EXACT_TRIE 4
3dab1dad 1245
a3621e74 1246STATIC I32
786e8c11 1247S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1248{
27da23d5 1249 dVAR;
a3621e74
YO
1250 /* first pass, loop through and scan words */
1251 reg_trie_data *trie;
1252 regnode *cur;
9f7f3913 1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1254 STRLEN len = 0;
1255 UV uvc = 0;
1256 U16 curword = 0;
1257 U32 next_alloc = 0;
786e8c11
YO
1258 regnode *jumper = NULL;
1259 regnode *nextbranch = NULL;
7f69552c 1260 regnode *convert = NULL;
a3621e74 1261 /* we just use folder as a flag in utf8 */
e1ec3a88 1262 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1263 ? PL_fold
1264 : ( flags == EXACTFL
1265 ? PL_fold_locale
1266 : NULL
1267 )
1268 );
1269
e1ec3a88 1270 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1271 SV *re_trie_maxbuff;
3dab1dad
YO
1272#ifndef DEBUGGING
1273 /* these are only used during construction but are useful during
8e11feef 1274 * debugging so we store them in the struct when debugging.
8e11feef 1275 */
3dab1dad 1276 STRLEN trie_charcount=0;
3dab1dad
YO
1277 AV *trie_revcharmap;
1278#endif
a3621e74 1279 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1280#ifndef DEBUGGING
1281 PERL_UNUSED_ARG(depth);
1282#endif
a3621e74 1283
446bd890 1284 trie = PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1285 trie->refcount = 1;
3dab1dad 1286 trie->startstate = 1;
786e8c11 1287 trie->wordcount = word_count;
f8fc2ecf 1288 RExC_rxi->data->data[ data_slot ] = (void*)trie;
446bd890 1289 trie->charmap = PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1290 if (!(UTF && folder))
446bd890 1291 trie->bitmap = PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74
YO
1292 DEBUG_r({
1293 trie->words = newAV();
a3621e74 1294 });
3dab1dad 1295 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1296
0111c4fd 1297 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1298 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1299 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1300 }
3dab1dad
YO
1301 DEBUG_OPTIMISE_r({
1302 PerlIO_printf( Perl_debug_log,
786e8c11 1303 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1304 (int)depth * 2 + 2, "",
1305 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1306 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1307 (int)depth);
3dab1dad 1308 });
7f69552c
YO
1309
1310 /* Find the node we are going to overwrite */
1311 if ( first == startbranch && OP( last ) != BRANCH ) {
1312 /* whole branch chain */
1313 convert = first;
1314 } else {
1315 /* branch sub-chain */
1316 convert = NEXTOPER( first );
1317 }
1318
a3621e74
YO
1319 /* -- First loop and Setup --
1320
1321 We first traverse the branches and scan each word to determine if it
1322 contains widechars, and how many unique chars there are, this is
1323 important as we have to build a table with at least as many columns as we
1324 have unique chars.
1325
1326 We use an array of integers to represent the character codes 0..255
1327 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1328 native representation of the character value as the key and IV's for the
1329 coded index.
1330
1331 *TODO* If we keep track of how many times each character is used we can
1332 remap the columns so that the table compression later on is more
1333 efficient in terms of memory by ensuring most common value is in the
1334 middle and the least common are on the outside. IMO this would be better
1335 than a most to least common mapping as theres a decent chance the most
1336 common letter will share a node with the least common, meaning the node
1337 will not be compressable. With a middle is most common approach the worst
1338 case is when we have the least common nodes twice.
1339
1340 */
1341
a3621e74 1342 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1343 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1344 const U8 *uc = (U8*)STRING( noper );
a28509cc 1345 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1346 STRLEN foldlen = 0;
1347 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1348 const U8 *scan = (U8*)NULL;
07be1b83 1349 U32 wordlen = 0; /* required init */
3dab1dad 1350 STRLEN chars=0;
a3621e74 1351
3dab1dad
YO
1352 if (OP(noper) == NOTHING) {
1353 trie->minlen= 0;
1354 continue;
1355 }
1356 if (trie->bitmap) {
1357 TRIE_BITMAP_SET(trie,*uc);
1358 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1359 }
a3621e74 1360 for ( ; uc < e ; uc += len ) {
3dab1dad 1361 TRIE_CHARCOUNT(trie)++;
a3621e74 1362 TRIE_READ_CHAR;
3dab1dad 1363 chars++;
a3621e74
YO
1364 if ( uvc < 256 ) {
1365 if ( !trie->charmap[ uvc ] ) {
1366 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1367 if ( folder )
1368 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1369 TRIE_STORE_REVCHAR;
a3621e74
YO
1370 }
1371 } else {
1372 SV** svpp;
1373 if ( !trie->widecharmap )
1374 trie->widecharmap = newHV();
1375
1376 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1377
1378 if ( !svpp )
e4584336 1379 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1380
1381 if ( !SvTRUE( *svpp ) ) {
1382 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1383 TRIE_STORE_REVCHAR;
a3621e74
YO
1384 }
1385 }
1386 }
3dab1dad
YO
1387 if( cur == first ) {
1388 trie->minlen=chars;
1389 trie->maxlen=chars;
1390 } else if (chars < trie->minlen) {
1391 trie->minlen=chars;
1392 } else if (chars > trie->maxlen) {
1393 trie->maxlen=chars;
1394 }
1395
a3621e74
YO
1396 } /* end first pass */
1397 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1398 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1399 (int)depth * 2 + 2,"",
85c3142d 1400 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1401 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1402 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1403 );
446bd890 1404 trie->wordlen = PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1405
1406 /*
1407 We now know what we are dealing with in terms of unique chars and
1408 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1409 representation using a flat table will take. If it's over a reasonable
1410 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1411 conservative but potentially much slower representation using an array
1412 of lists.
1413
1414 At the end we convert both representations into the same compressed
1415 form that will be used in regexec.c for matching with. The latter
1416 is a form that cannot be used to construct with but has memory
1417 properties similar to the list form and access properties similar
1418 to the table form making it both suitable for fast searches and
1419 small enough that its feasable to store for the duration of a program.
1420
1421 See the comment in the code where the compressed table is produced
1422 inplace from the flat tabe representation for an explanation of how
1423 the compression works.
1424
1425 */
1426
1427
3dab1dad 1428 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1429 /*
1430 Second Pass -- Array Of Lists Representation
1431
1432 Each state will be represented by a list of charid:state records
1433 (reg_trie_trans_le) the first such element holds the CUR and LEN
1434 points of the allocated array. (See defines above).
1435
1436 We build the initial structure using the lists, and then convert
1437 it into the compressed table form which allows faster lookups
1438 (but cant be modified once converted).
a3621e74
YO
1439 */
1440
a3621e74
YO
1441 STRLEN transcount = 1;
1442
1e2e3d02
YO
1443 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1444 "%*sCompiling trie using list compiler\n",
1445 (int)depth * 2 + 2, ""));
446bd890
NC
1446
1447 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1448 sizeof(reg_trie_state) );
a3621e74
YO
1449 TRIE_LIST_NEW(1);
1450 next_alloc = 2;
1451
1452 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1453
c445ea15
AL
1454 regnode * const noper = NEXTOPER( cur );
1455 U8 *uc = (U8*)STRING( noper );
1456 const U8 * const e = uc + STR_LEN( noper );
1457 U32 state = 1; /* required init */
1458 U16 charid = 0; /* sanity init */
1459 U8 *scan = (U8*)NULL; /* sanity init */
1460 STRLEN foldlen = 0; /* required init */
07be1b83 1461 U32 wordlen = 0; /* required init */
c445ea15
AL
1462 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1463
3dab1dad 1464 if (OP(noper) != NOTHING) {
786e8c11 1465 for ( ; uc < e ; uc += len ) {
c445ea15 1466
786e8c11 1467 TRIE_READ_CHAR;
c445ea15 1468
786e8c11
YO
1469 if ( uvc < 256 ) {
1470 charid = trie->charmap[ uvc ];
c445ea15 1471 } else {
786e8c11
YO
1472 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1473 if ( !svpp ) {
1474 charid = 0;
1475 } else {
1476 charid=(U16)SvIV( *svpp );
1477 }
c445ea15 1478 }
786e8c11
YO
1479 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1480 if ( charid ) {
a3621e74 1481
786e8c11
YO
1482 U16 check;
1483 U32 newstate = 0;
a3621e74 1484
786e8c11
YO
1485 charid--;
1486 if ( !trie->states[ state ].trans.list ) {
1487 TRIE_LIST_NEW( state );
c445ea15 1488 }
786e8c11
YO
1489 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1490 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1491 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1492 break;
1493 }
1494 }
1495 if ( ! newstate ) {
1496 newstate = next_alloc++;
1497 TRIE_LIST_PUSH( state, charid, newstate );
1498 transcount++;
1499 }
1500 state = newstate;
1501 } else {
1502 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1503 }
a28509cc 1504 }
c445ea15 1505 }
3dab1dad 1506 TRIE_HANDLE_WORD(state);
a3621e74
YO
1507
1508 } /* end second pass */
1509
1e2e3d02
YO
1510 /* next alloc is the NEXT state to be allocated */
1511 trie->statecount = next_alloc;
446bd890
NC
1512 trie->states = PerlMemShared_realloc( trie->states, next_alloc
1513 * sizeof(reg_trie_state) );
a3621e74 1514
3dab1dad
YO
1515 /* and now dump it out before we compress it */
1516 DEBUG_TRIE_COMPILE_MORE_r(
1517 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1518 );
a3621e74 1519
446bd890
NC
1520 trie->trans
1521 = PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1522 {
1523 U32 state;
a3621e74
YO
1524 U32 tp = 0;
1525 U32 zp = 0;
1526
1527
1528 for( state=1 ; state < next_alloc ; state ++ ) {
1529 U32 base=0;
1530
1531 /*
1532 DEBUG_TRIE_COMPILE_MORE_r(
1533 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1534 );
1535 */
1536
1537 if (trie->states[state].trans.list) {
1538 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1539 U16 maxid=minid;
a28509cc 1540 U16 idx;
a3621e74
YO
1541
1542 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1543 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1544 if ( forid < minid ) {
1545 minid=forid;
1546 } else if ( forid > maxid ) {
1547 maxid=forid;
1548 }
a3621e74
YO
1549 }
1550 if ( transcount < tp + maxid - minid + 1) {
1551 transcount *= 2;
446bd890
NC
1552 trie->trans
1553 = PerlMemShared_realloc( trie->trans,
1554 transcount
1555 * sizeof(reg_trie_trans) );
a3621e74
YO
1556 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1557 }
1558 base = trie->uniquecharcount + tp - minid;
1559 if ( maxid == minid ) {
1560 U32 set = 0;
1561 for ( ; zp < tp ; zp++ ) {
1562 if ( ! trie->trans[ zp ].next ) {
1563 base = trie->uniquecharcount + zp - minid;
1564 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1565 trie->trans[ zp ].check = state;
1566 set = 1;
1567 break;
1568 }
1569 }
1570 if ( !set ) {
1571 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1572 trie->trans[ tp ].check = state;
1573 tp++;
1574 zp = tp;
1575 }
1576 } else {
1577 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1578 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1579 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1580 trie->trans[ tid ].check = state;
1581 }
1582 tp += ( maxid - minid + 1 );
1583 }
1584 Safefree(trie->states[ state ].trans.list);
1585 }
1586 /*
1587 DEBUG_TRIE_COMPILE_MORE_r(
1588 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1589 );
1590 */
1591 trie->states[ state ].trans.base=base;
1592 }
cc601c31 1593 trie->lasttrans = tp + 1;
a3621e74
YO
1594 }
1595 } else {
1596 /*
1597 Second Pass -- Flat Table Representation.
1598
1599 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1600 We know that we will need Charcount+1 trans at most to store the data
1601 (one row per char at worst case) So we preallocate both structures
1602 assuming worst case.
1603
1604 We then construct the trie using only the .next slots of the entry
1605 structs.
1606
1607 We use the .check field of the first entry of the node temporarily to
1608 make compression both faster and easier by keeping track of how many non
1609 zero fields are in the node.
1610
1611 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1612 transition.
1613
1614 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1615 number representing the first entry of the node, and state as a
1616 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1617 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1618 are 2 entrys per node. eg:
1619
1620 A B A B
1621 1. 2 4 1. 3 7
1622 2. 0 3 3. 0 5
1623 3. 0 0 5. 0 0
1624 4. 0 0 7. 0 0
1625
1626 The table is internally in the right hand, idx form. However as we also
1627 have to deal with the states array which is indexed by nodenum we have to
1628 use TRIE_NODENUM() to convert.
1629
1630 */
1e2e3d02
YO
1631 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1632 "%*sCompiling trie using table compiler\n",
1633 (int)depth * 2 + 2, ""));
3dab1dad 1634
446bd890
NC
1635 trie->trans = PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1636 * trie->uniquecharcount + 1,
1637 sizeof(reg_trie_trans) );
1638 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1639 sizeof(reg_trie_state) );
a3621e74
YO
1640 next_alloc = trie->uniquecharcount + 1;
1641
3dab1dad 1642
a3621e74
YO
1643 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1644
c445ea15 1645 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1646 const U8 *uc = (U8*)STRING( noper );
1647 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1648
1649 U32 state = 1; /* required init */
1650
1651 U16 charid = 0; /* sanity init */
1652 U32 accept_state = 0; /* sanity init */
1653 U8 *scan = (U8*)NULL; /* sanity init */
1654
1655 STRLEN foldlen = 0; /* required init */
07be1b83 1656 U32 wordlen = 0; /* required init */
a3621e74
YO
1657 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1658
3dab1dad 1659 if ( OP(noper) != NOTHING ) {
786e8c11 1660 for ( ; uc < e ; uc += len ) {
a3621e74 1661
786e8c11 1662 TRIE_READ_CHAR;
a3621e74 1663
786e8c11
YO
1664 if ( uvc < 256 ) {
1665 charid = trie->charmap[ uvc ];
1666 } else {
1667 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1668 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1669 }
786e8c11
YO
1670 if ( charid ) {
1671 charid--;
1672 if ( !trie->trans[ state + charid ].next ) {
1673 trie->trans[ state + charid ].next = next_alloc;
1674 trie->trans[ state ].check++;
1675 next_alloc += trie->uniquecharcount;
1676 }
1677 state = trie->trans[ state + charid ].next;
1678 } else {
1679 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1680 }
1681 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1682 }
a3621e74 1683 }
3dab1dad
YO
1684 accept_state = TRIE_NODENUM( state );
1685 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1686
1687 } /* end second pass */
1688
3dab1dad
YO
1689 /* and now dump it out before we compress it */
1690 DEBUG_TRIE_COMPILE_MORE_r(
1691 dump_trie_interim_table(trie,next_alloc,depth+1)
1692 );
a3621e74 1693
a3621e74
YO
1694 {
1695 /*
1696 * Inplace compress the table.*
1697
1698 For sparse data sets the table constructed by the trie algorithm will
1699 be mostly 0/FAIL transitions or to put it another way mostly empty.
1700 (Note that leaf nodes will not contain any transitions.)
1701
1702 This algorithm compresses the tables by eliminating most such
1703 transitions, at the cost of a modest bit of extra work during lookup:
1704
1705 - Each states[] entry contains a .base field which indicates the
1706 index in the state[] array wheres its transition data is stored.
1707
1708 - If .base is 0 there are no valid transitions from that node.
1709
1710 - If .base is nonzero then charid is added to it to find an entry in
1711 the trans array.
1712
1713 -If trans[states[state].base+charid].check!=state then the
1714 transition is taken to be a 0/Fail transition. Thus if there are fail
1715 transitions at the front of the node then the .base offset will point
1716 somewhere inside the previous nodes data (or maybe even into a node
1717 even earlier), but the .check field determines if the transition is
1718 valid.
1719
786e8c11 1720 XXX - wrong maybe?
a3621e74
YO
1721 The following process inplace converts the table to the compressed
1722 table: We first do not compress the root node 1,and mark its all its
1723 .check pointers as 1 and set its .base pointer as 1 as well. This
1724 allows to do a DFA construction from the compressed table later, and
1725 ensures that any .base pointers we calculate later are greater than
1726 0.
1727
1728 - We set 'pos' to indicate the first entry of the second node.
1729
1730 - We then iterate over the columns of the node, finding the first and
1731 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1732 and set the .check pointers accordingly, and advance pos
1733 appropriately and repreat for the next node. Note that when we copy
1734 the next pointers we have to convert them from the original
1735 NODEIDX form to NODENUM form as the former is not valid post
1736 compression.
1737
1738 - If a node has no transitions used we mark its base as 0 and do not
1739 advance the pos pointer.
1740
1741 - If a node only has one transition we use a second pointer into the
1742 structure to fill in allocated fail transitions from other states.
1743 This pointer is independent of the main pointer and scans forward
1744 looking for null transitions that are allocated to a state. When it
1745 finds one it writes the single transition into the "hole". If the
786e8c11 1746 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1747
1748 - Once compressed we can Renew/realloc the structures to release the
1749 excess space.
1750
1751 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1752 specifically Fig 3.47 and the associated pseudocode.
1753
1754 demq
1755 */
a3b680e6 1756 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1757 U32 state, charid;
a3621e74 1758 U32 pos = 0, zp=0;
1e2e3d02 1759 trie->statecount = laststate;
a3621e74
YO
1760
1761 for ( state = 1 ; state < laststate ; state++ ) {
1762 U8 flag = 0;
a28509cc
AL
1763 const U32 stateidx = TRIE_NODEIDX( state );
1764 const U32 o_used = trie->trans[ stateidx ].check;
1765 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1766 trie->trans[ stateidx ].check = 0;
1767
1768 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1769 if ( flag || trie->trans[ stateidx + charid ].next ) {
1770 if ( trie->trans[ stateidx + charid ].next ) {
1771 if (o_used == 1) {
1772 for ( ; zp < pos ; zp++ ) {
1773 if ( ! trie->trans[ zp ].next ) {
1774 break;
1775 }
1776 }
1777 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1778 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1779 trie->trans[ zp ].check = state;
1780 if ( ++zp > pos ) pos = zp;
1781 break;
1782 }
1783 used--;
1784 }
1785 if ( !flag ) {
1786 flag = 1;
1787 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1788 }
1789 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1790 trie->trans[ pos ].check = state;
1791 pos++;
1792 }
1793 }
1794 }
cc601c31 1795 trie->lasttrans = pos + 1;
446bd890
NC
1796 trie->states = PerlMemShared_realloc( trie->states, laststate
1797 * sizeof(reg_trie_state) );
a3621e74 1798 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1799 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1800 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1801 (int)depth * 2 + 2,"",
1802 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1803 (IV)next_alloc,
1804 (IV)pos,
a3621e74
YO
1805 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1806 );
1807
1808 } /* end table compress */
1809 }
1e2e3d02
YO
1810 DEBUG_TRIE_COMPILE_MORE_r(
1811 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1812 (int)depth * 2 + 2, "",
1813 (UV)trie->statecount,
1814 (UV)trie->lasttrans)
1815 );
cc601c31 1816 /* resize the trans array to remove unused space */
446bd890
NC
1817 trie->trans = PerlMemShared_realloc( trie->trans, trie->lasttrans
1818 * sizeof(reg_trie_trans) );
a3621e74 1819
3dab1dad
YO
1820 /* and now dump out the compressed format */
1821 DEBUG_TRIE_COMPILE_r(
1822 dump_trie(trie,depth+1)
1823 );
07be1b83 1824
3dab1dad 1825 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1826 U8 nodetype =(U8)(flags & 0xFF);
1827 char *str=NULL;
786e8c11 1828
07be1b83 1829#ifdef DEBUGGING
e62cc96a 1830 regnode *optimize = NULL;
b57a0404
JH
1831 U32 mjd_offset = 0;
1832 U32 mjd_nodelen = 0;
07be1b83 1833#endif
a3621e74 1834 /*
3dab1dad
YO
1835 This means we convert either the first branch or the first Exact,
1836 depending on whether the thing following (in 'last') is a branch
1837 or not and whther first is the startbranch (ie is it a sub part of
1838 the alternation or is it the whole thing.)
1839 Assuming its a sub part we conver the EXACT otherwise we convert
1840 the whole branch sequence, including the first.
a3621e74 1841 */
3dab1dad 1842 /* Find the node we are going to overwrite */
7f69552c 1843 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1844 /* branch sub-chain */
3dab1dad 1845 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1846 DEBUG_r({
1847 mjd_offset= Node_Offset((convert));
1848 mjd_nodelen= Node_Length((convert));
1849 });
7f69552c
YO
1850 /* whole branch chain */
1851 } else {
1852 DEBUG_r({
1853 const regnode *nop = NEXTOPER( convert );
1854 mjd_offset= Node_Offset((nop));
1855 mjd_nodelen= Node_Length((nop));
1856 });
07be1b83 1857 }
7f69552c 1858
07be1b83
YO
1859 DEBUG_OPTIMISE_r(
1860 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1861 (int)depth * 2 + 2, "",
786e8c11 1862 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1863 );
a3621e74 1864
3dab1dad
YO
1865 /* But first we check to see if there is a common prefix we can
1866 split out as an EXACT and put in front of the TRIE node. */
1867 trie->startstate= 1;
786e8c11 1868 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1869 U32 state;
1e2e3d02 1870 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1871 U32 ofs = 0;
8e11feef
RGS
1872 I32 idx = -1;
1873 U32 count = 0;
1874 const U32 base = trie->states[ state ].trans.base;
a3621e74 1875
3dab1dad 1876 if ( trie->states[state].wordnum )
8e11feef 1877 count = 1;
a3621e74 1878
8e11feef 1879 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1880 if ( ( base + ofs >= trie->uniquecharcount ) &&
1881 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1882 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1883 {
3dab1dad 1884 if ( ++count > 1 ) {
8e11feef 1885 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1886 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1887 if ( state == 1 ) break;
3dab1dad
YO
1888 if ( count == 2 ) {
1889 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1890 DEBUG_OPTIMISE_r(
8e11feef
RGS
1891 PerlIO_printf(Perl_debug_log,
1892 "%*sNew Start State=%"UVuf" Class: [",
1893 (int)depth * 2 + 2, "",
786e8c11 1894 (UV)state));
be8e71aa
YO
1895 if (idx >= 0) {
1896 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1897 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1898
3dab1dad 1899 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1900 if ( folder )
1901 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1902 DEBUG_OPTIMISE_r(
07be1b83 1903 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1904 );
8e11feef
RGS
1905 }
1906 }
1907 TRIE_BITMAP_SET(trie,*ch);
1908 if ( folder )
1909 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1910 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1911 }
1912 idx = ofs;
1913 }
3dab1dad
YO
1914 }
1915 if ( count == 1 ) {
1916 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
de734bd5
A
1917 char *ch = SvPV_nolen( *tmp );
1918 DEBUG_OPTIMISE_r({
1919 SV *sv=sv_newmortal();
8e11feef
RGS
1920 PerlIO_printf( Perl_debug_log,
1921 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1922 (int)depth * 2 + 2, "",
de734bd5
A
1923 (UV)state, (UV)idx,
1924 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1925 PL_colors[0], PL_colors[1],
1926 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1927 PERL_PV_ESCAPE_FIRSTCHAR
1928 )
1929 );
1930 });
3dab1dad
YO
1931 if ( state==1 ) {
1932 OP( convert ) = nodetype;
1933 str=STRING(convert);
1934 STR_LEN(convert)=0;
1935 }
de734bd5
A
1936 while (*ch) {
1937 *str++ = *ch++;
1938 STR_LEN(convert)++;
1939 }
1940
8e11feef 1941 } else {
f9049ba1 1942#ifdef DEBUGGING
8e11feef
RGS
1943 if (state>1)
1944 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1945#endif
8e11feef
RGS
1946 break;
1947 }
1948 }
3dab1dad 1949 if (str) {
8e11feef 1950 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1951 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1952 trie->startstate = state;
07be1b83
YO
1953 trie->minlen -= (state - 1);
1954 trie->maxlen -= (state - 1);
1955 DEBUG_r({
1956 regnode *fix = convert;
de734bd5 1957 U32 word = trie->wordcount;
07be1b83
YO
1958 mjd_nodelen++;
1959 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1960 while( ++fix < n ) {
1961 Set_Node_Offset_Length(fix, 0, 0);
1962 }
de734bd5
A
1963 while (word--) {
1964 SV ** const tmp = av_fetch( trie->words, word, 0 );
1965 if (tmp) {
1966 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1967 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1968 else
1969 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1970 }
1971 }
07be1b83 1972 });
8e11feef
RGS
1973 if (trie->maxlen) {
1974 convert = n;
1975 } else {
3dab1dad 1976 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1977 DEBUG_r(optimize= n);
3dab1dad
YO
1978 }
1979 }
1980 }
a5ca303d
YO
1981 if (!jumper)
1982 jumper = last;
3dab1dad 1983 if ( trie->maxlen ) {
8e11feef
RGS
1984 NEXT_OFF( convert ) = (U16)(tail - convert);
1985 ARG_SET( convert, data_slot );
786e8c11
YO
1986 /* Store the offset to the first unabsorbed branch in
1987 jump[0], which is otherwise unused by the jump logic.
1988 We use this when dumping a trie and during optimisation. */
1989 if (trie->jump)
7f69552c 1990 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1991
786e8c11
YO
1992 /* XXXX */
1993 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1994 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1995 {
1996 OP( convert ) = TRIEC;
1997 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 1998 PerlMemShared_free(trie->bitmap);
786e8c11
YO
1999 trie->bitmap= NULL;
2000 } else
2001 OP( convert ) = TRIE;
a3621e74 2002
3dab1dad
YO
2003 /* store the type in the flags */
2004 convert->flags = nodetype;
a5ca303d
YO
2005 DEBUG_r({
2006 optimize = convert
2007 + NODE_STEP_REGNODE
2008 + regarglen[ OP( convert ) ];
2009 });
2010 /* XXX We really should free up the resource in trie now,
2011 as we won't use them - (which resources?) dmq */
3dab1dad 2012 }
a3621e74 2013 /* needed for dumping*/
e62cc96a 2014 DEBUG_r(if (optimize) {
07be1b83 2015 regnode *opt = convert;
e62cc96a 2016 while ( ++opt < optimize) {
07be1b83
YO
2017 Set_Node_Offset_Length(opt,0,0);
2018 }
786e8c11
YO
2019 /*
2020 Try to clean up some of the debris left after the
2021 optimisation.
a3621e74 2022 */
786e8c11 2023 while( optimize < jumper ) {
07be1b83 2024 mjd_nodelen += Node_Length((optimize));
a3621e74 2025 OP( optimize ) = OPTIMIZED;
07be1b83 2026 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2027 optimize++;
2028 }
07be1b83 2029 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2030 });
2031 } /* end node insert */
07be1b83 2032#ifndef DEBUGGING
6e8b4190 2033 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 2034#endif
786e8c11
YO
2035 return trie->jump
2036 ? MADE_JUMP_TRIE
2037 : trie->startstate>1
2038 ? MADE_EXACT_TRIE
2039 : MADE_TRIE;
2040}
2041
2042STATIC void
2043S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2044{
2045/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2046
2047 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2048 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2049 ISBN 0-201-10088-6
2050
2051 We find the fail state for each state in the trie, this state is the longest proper
2052 suffix of the current states 'word' that is also a proper prefix of another word in our
2053 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2054 the DFA not to have to restart after its tried and failed a word at a given point, it
2055 simply continues as though it had been matching the other word in the first place.
2056 Consider
2057 'abcdgu'=~/abcdefg|cdgu/
2058 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2059 fail, which would bring use to the state representing 'd' in the second word where we would
2060 try 'g' and succeed, prodceding to match 'cdgu'.
2061 */
2062 /* add a fail transition */
3251b653
NC
2063 const U32 trie_offset = ARG(source);
2064 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2065 U32 *q;
2066 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2067 const U32 numstates = trie->statecount;
786e8c11
YO
2068 const U32 ubound = trie->lasttrans + ucharcount;
2069 U32 q_read = 0;
2070 U32 q_write = 0;
2071 U32 charid;
2072 U32 base = trie->states[ 1 ].trans.base;
2073 U32 *fail;
2074 reg_ac_data *aho;
2075 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2076 GET_RE_DEBUG_FLAGS_DECL;
2077#ifndef DEBUGGING
2078 PERL_UNUSED_ARG(depth);
2079#endif
2080
2081
2082 ARG_SET( stclass, data_slot );
446bd890 2083 aho = PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2084 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2085 aho->trie=trie_offset;
446bd890
NC
2086 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2087 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2088 Newxz( q, numstates, U32);
446bd890 2089 aho->fail = PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2090 aho->refcount = 1;
2091 fail = aho->fail;
2092 /* initialize fail[0..1] to be 1 so that we always have
2093 a valid final fail state */
2094 fail[ 0 ] = fail[ 1 ] = 1;
2095
2096 for ( charid = 0; charid < ucharcount ; charid++ ) {
2097 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2098 if ( newstate ) {
2099 q[ q_write ] = newstate;
2100 /* set to point at the root */
2101 fail[ q[ q_write++ ] ]=1;
2102 }
2103 }
2104 while ( q_read < q_write) {
2105 const U32 cur = q[ q_read++ % numstates ];
2106 base = trie->states[ cur ].trans.base;
2107
2108 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2109 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2110 if (ch_state) {
2111 U32 fail_state = cur;
2112 U32 fail_base;
2113 do {
2114 fail_state = fail[ fail_state ];
2115 fail_base = aho->states[ fail_state ].trans.base;
2116 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2117
2118 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2119 fail[ ch_state ] = fail_state;
2120 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2121 {
2122 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2123 }
2124 q[ q_write++ % numstates] = ch_state;
2125 }
2126 }
2127 }
2128 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2129 when we fail in state 1, this allows us to use the
2130 charclass scan to find a valid start char. This is based on the principle
2131 that theres a good chance the string being searched contains lots of stuff
2132 that cant be a start char.
2133 */
2134 fail[ 0 ] = fail[ 1 ] = 0;
2135 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2136 PerlIO_printf(Perl_debug_log,
2137 "%*sStclass Failtable (%"UVuf" states): 0",
2138 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2139 );
786e8c11
YO
2140 for( q_read=1; q_read<numstates; q_read++ ) {
2141 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2142 }
2143 PerlIO_printf(Perl_debug_log, "\n");
2144 });
2145 Safefree(q);
2146 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2147}
2148
786e8c11 2149
a3621e74 2150/*
5d1c421c
JH
2151 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2152 * These need to be revisited when a newer toolchain becomes available.
2153 */
2154#if defined(__sparc64__) && defined(__GNUC__)
2155# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2156# undef SPARC64_GCC_WORKAROUND
2157# define SPARC64_GCC_WORKAROUND 1
2158# endif
2159#endif
2160
07be1b83 2161#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2162 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2163 SV * const mysv=sv_newmortal(); \
2164 regnode *Next = regnext(scan); \
2165 regprop(RExC_rx, mysv, scan); \
7f69552c 2166 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2167 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2168 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2169 }});
07be1b83 2170
1de06328
YO
2171
2172
2173
2174
07be1b83
YO
2175#define JOIN_EXACT(scan,min,flags) \
2176 if (PL_regkind[OP(scan)] == EXACT) \
2177 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2178
be8e71aa 2179STATIC U32
07be1b83
YO
2180S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2181 /* Merge several consecutive EXACTish nodes into one. */
2182 regnode *n = regnext(scan);
2183 U32 stringok = 1;
2184 regnode *next = scan + NODE_SZ_STR(scan);
2185 U32 merged = 0;
2186 U32 stopnow = 0;
2187#ifdef DEBUGGING
2188 regnode *stop = scan;
72f13be8 2189 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2190#else
d47053eb
RGS
2191 PERL_UNUSED_ARG(depth);
2192#endif
2193#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2194 PERL_UNUSED_ARG(flags);
2195 PERL_UNUSED_ARG(val);
07be1b83 2196#endif
07be1b83
YO
2197 DEBUG_PEEP("join",scan,depth);
2198
2199 /* Skip NOTHING, merge EXACT*. */
2200 while (n &&
2201 ( PL_regkind[OP(n)] == NOTHING ||
2202 (stringok && (OP(n) == OP(scan))))
2203 && NEXT_OFF(n)
2204 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2205
2206 if (OP(n) == TAIL || n > next)
2207 stringok = 0;
2208 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2209 DEBUG_PEEP("skip:",n,depth);
2210 NEXT_OFF(scan) += NEXT_OFF(n);
2211 next = n + NODE_STEP_REGNODE;
2212#ifdef DEBUGGING
2213 if (stringok)
2214 stop = n;
2215#endif
2216 n = regnext(n);
2217 }
2218 else if (stringok) {
786e8c11 2219 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2220 regnode * const nnext = regnext(n);
2221
2222 DEBUG_PEEP("merg",n,depth);
2223
2224 merged++;
2225 if (oldl + STR_LEN(n) > U8_MAX)
2226 break;
2227 NEXT_OFF(scan) += NEXT_OFF(n);
2228 STR_LEN(scan) += STR_LEN(n);
2229 next = n + NODE_SZ_STR(n);
2230 /* Now we can overwrite *n : */
2231 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2232#ifdef DEBUGGING
2233 stop = next - 1;
2234#endif
2235 n = nnext;
2236 if (stopnow) break;
2237 }
2238
d47053eb
RGS
2239#ifdef EXPERIMENTAL_INPLACESCAN
2240 if (flags && !NEXT_OFF(n)) {
2241 DEBUG_PEEP("atch", val, depth);
2242 if (reg_off_by_arg[OP(n)]) {
2243 ARG_SET(n, val - n);
2244 }
2245 else {
2246 NEXT_OFF(n) = val - n;
2247 }
2248 stopnow = 1;
2249 }
07be1b83
YO
2250#endif
2251 }
2252
2253 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2254 /*
2255 Two problematic code points in Unicode casefolding of EXACT nodes:
2256
2257 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2258 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2259
2260 which casefold to
2261
2262 Unicode UTF-8
2263
2264 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2265 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2266
2267 This means that in case-insensitive matching (or "loose matching",
2268 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2269 length of the above casefolded versions) can match a target string
2270 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2271 This would rather mess up the minimum length computation.
2272
2273 What we'll do is to look for the tail four bytes, and then peek
2274 at the preceding two bytes to see whether we need to decrease
2275 the minimum length by four (six minus two).
2276
2277 Thanks to the design of UTF-8, there cannot be false matches:
2278 A sequence of valid UTF-8 bytes cannot be a subsequence of
2279 another valid sequence of UTF-8 bytes.
2280
2281 */
2282 char * const s0 = STRING(scan), *s, *t;
2283 char * const s1 = s0 + STR_LEN(scan) - 1;
2284 char * const s2 = s1 - 4;
e294cc5d
JH
2285#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2286 const char t0[] = "\xaf\x49\xaf\x42";
2287#else
07be1b83 2288 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2289#endif
07be1b83
YO
2290 const char * const t1 = t0 + 3;
2291
2292 for (s = s0 + 2;
2293 s < s2 && (t = ninstr(s, s1, t0, t1));
2294 s = t + 4) {
e294cc5d
JH
2295#ifdef EBCDIC
2296 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2297 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2298#else
07be1b83
YO
2299 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2300 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2301#endif
07be1b83
YO
2302 *min -= 4;
2303 }
2304 }
2305
2306#ifdef DEBUGGING
2307 /* Allow dumping */
2308 n = scan + NODE_SZ_STR(scan);
2309 while (n <= stop) {
2310 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2311 OP(n) = OPTIMIZED;
2312 NEXT_OFF(n) = 0;
2313 }
2314 n++;
2315 }
2316#endif
2317 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2318 return stopnow;
2319}
2320
653099ff
GS
2321/* REx optimizer. Converts nodes into quickier variants "in place".
2322 Finds fixed substrings. */
2323
a0288114 2324/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2325 to the position after last scanned or to NULL. */
2326
40d049e4
YO
2327#define INIT_AND_WITHP \
2328 assert(!and_withp); \
2329 Newx(and_withp,1,struct regnode_charclass_class); \
2330 SAVEFREEPV(and_withp)
07be1b83 2331
b515a41d
YO
2332/* this is a chain of data about sub patterns we are processing that
2333 need to be handled seperately/specially in study_chunk. Its so
2334 we can simulate recursion without losing state. */
2335struct scan_frame;
2336typedef struct scan_frame {
2337 regnode *last; /* last node to process in this frame */
2338 regnode *next; /* next node to process when last is reached */
2339 struct scan_frame *prev; /*previous frame*/
2340 I32 stop; /* what stopparen do we use */
2341} scan_frame;
2342
76e3520e 2343STATIC I32
40d049e4 2344S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2345 I32 *minlenp, I32 *deltap,
40d049e4
YO
2346 regnode *last,
2347 scan_data_t *data,
2348 I32 stopparen,
2349 U8* recursed,
2350 struct regnode_charclass_class *and_withp,
2351 U32 flags, U32 depth)
c277df42
IZ
2352 /* scanp: Start here (read-write). */
2353 /* deltap: Write maxlen-minlen here. */
2354 /* last: Stop before this one. */
40d049e4
YO
2355 /* data: string data about the pattern */
2356 /* stopparen: treat close N as END */
2357 /* recursed: which subroutines have we recursed into */
2358 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2359{
97aff369 2360 dVAR;
c277df42
IZ
2361 I32 min = 0, pars = 0, code;
2362 regnode *scan = *scanp, *next;
2363 I32 delta = 0;
2364 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2365 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2366 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2367 scan_data_t data_fake;
a3621e74 2368 SV *re_trie_maxbuff = NULL;
786e8c11 2369 regnode *first_non_open = scan;
e2e6a0f1 2370 I32 stopmin = I32_MAX;
8aa23a47
YO
2371 scan_frame *frame = NULL;
2372
a3621e74 2373 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2374
13a24bad 2375#ifdef DEBUGGING
40d049e4 2376 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2377#endif
40d049e4 2378
786e8c11 2379 if ( depth == 0 ) {
40d049e4 2380 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2381 first_non_open=regnext(first_non_open);
2382 }
2383
b81d288d 2384
8aa23a47
YO
2385 fake_study_recurse:
2386 while ( scan && OP(scan) != END && scan < last ){
2387 /* Peephole optimizer: */
2388 DEBUG_STUDYDATA(data,depth);
2389 DEBUG_PEEP("Peep",scan,depth);
2390 JOIN_EXACT(scan,&min,0);
2391
2392 /* Follow the next-chain of the current node and optimize
2393 away all the NOTHINGs from it. */
2394 if (OP(scan) != CURLYX) {
2395 const int max = (reg_off_by_arg[OP(scan)]
2396 ? I32_MAX
2397 /* I32 may be smaller than U16 on CRAYs! */
2398 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2399 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2400 int noff;
2401 regnode *n = scan;
2402
2403 /* Skip NOTHING and LONGJMP. */
2404 while ((n = regnext(n))
2405 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2406 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2407 && off + noff < max)
2408 off += noff;
2409 if (reg_off_by_arg[OP(scan)])
2410 ARG(scan) = off;
2411 else
2412 NEXT_OFF(scan) = off;
2413 }
a3621e74 2414
c277df42 2415
8aa23a47
YO
2416
2417 /* The principal pseudo-switch. Cannot be a switch, since we
2418 look into several different things. */
2419 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2420 || OP(scan) == IFTHEN) {
2421 next = regnext(scan);
2422 code = OP(scan);
2423 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2424
2425 if (OP(next) == code || code == IFTHEN) {
2426 /* NOTE - There is similar code to this block below for handling
2427 TRIE nodes on a re-study. If you change stuff here check there
2428 too. */
2429 I32 max1 = 0, min1 = I32_MAX, num = 0;
2430 struct regnode_charclass_class accum;
2431 regnode * const startbranch=scan;
2432
2433 if (flags & SCF_DO_SUBSTR)
2434 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2435 if (flags & SCF_DO_STCLASS)
2436 cl_init_zero(pRExC_state, &accum);
2437
2438 while (OP(scan) == code) {
2439 I32 deltanext, minnext, f = 0, fake;
2440 struct regnode_charclass_class this_class;
2441
2442 num++;
2443 data_fake.flags = 0;
2444 if (data) {
2445 data_fake.whilem_c = data->whilem_c;
2446 data_fake.last_closep = data->last_closep;
2447 }
2448 else
2449 data_fake.last_closep = &fake;
58e23c8d
YO
2450
2451 data_fake.pos_delta = delta;
8aa23a47
YO
2452 next = regnext(scan);
2453 scan = NEXTOPER(scan);
2454 if (code != BRANCH)
c277df42 2455 scan = NEXTOPER(scan);
8aa23a47
YO
2456 if (flags & SCF_DO_STCLASS) {
2457 cl_init(pRExC_state, &this_class);
2458 data_fake.start_class = &this_class;
2459 f = SCF_DO_STCLASS_AND;
58e23c8d 2460 }
8aa23a47
YO
2461 if (flags & SCF_WHILEM_VISITED_POS)
2462 f |= SCF_WHILEM_VISITED_POS;
2463
2464 /* we suppose the run is continuous, last=next...*/
2465 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2466 next, &data_fake,
2467 stopparen, recursed, NULL, f,depth+1);
2468 if (min1 > minnext)
2469 min1 = minnext;
2470 if (max1 < minnext + deltanext)
2471 max1 = minnext + deltanext;
2472 if (deltanext == I32_MAX)
2473 is_inf = is_inf_internal = 1;
2474 scan = next;
2475 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2476 pars++;
2477 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2478 if ( stopmin > minnext)
2479 stopmin = min + min1;
2480 flags &= ~SCF_DO_SUBSTR;
2481 if (data)
2482 data->flags |= SCF_SEEN_ACCEPT;
2483 }
2484 if (data) {
2485 if (data_fake.flags & SF_HAS_EVAL)
2486 data->flags |= SF_HAS_EVAL;
2487 data->whilem_c = data_fake.whilem_c;
3dab1dad 2488 }
8aa23a47
YO
2489 if (flags & SCF_DO_STCLASS)
2490 cl_or(pRExC_state, &accum, &this_class);
2491 }
2492 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2493 min1 = 0;
2494 if (flags & SCF_DO_SUBSTR) {
2495 data->pos_min += min1;
2496 data->pos_delta += max1 - min1;
2497 if (max1 != min1 || is_inf)
2498 data->longest = &(data->longest_float);
2499 }
2500 min += min1;
2501 delta += max1 - min1;
2502 if (flags & SCF_DO_STCLASS_OR) {
2503 cl_or(pRExC_state, data->start_class, &accum);
2504 if (min1) {
2505 cl_and(data->start_class, and_withp);
2506 flags &= ~SCF_DO_STCLASS;
653099ff 2507 }
8aa23a47
YO
2508 }
2509 else if (flags & SCF_DO_STCLASS_AND) {
2510 if (min1) {
2511 cl_and(data->start_class, &accum);
2512 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2513 }
8aa23a47
YO
2514 else {
2515 /* Switch to OR mode: cache the old value of
2516 * data->start_class */
2517 INIT_AND_WITHP;
2518 StructCopy(data->start_class, and_withp,
2519 struct regnode_charclass_class);
2520 flags &= ~SCF_DO_STCLASS_AND;
2521 StructCopy(&accum, data->start_class,
2522 struct regnode_charclass_class);
2523 flags |= SCF_DO_STCLASS_OR;
2524 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2525 }
8aa23a47 2526 }
a3621e74 2527
8aa23a47
YO
2528 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2529 /* demq.
a3621e74 2530
8aa23a47
YO
2531 Assuming this was/is a branch we are dealing with: 'scan' now
2532 points at the item that follows the branch sequence, whatever
2533 it is. We now start at the beginning of the sequence and look
2534 for subsequences of
a3621e74 2535
8aa23a47
YO
2536 BRANCH->EXACT=>x1
2537 BRANCH->EXACT=>x2
2538 tail
a3621e74 2539
8aa23a47 2540 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2541
8aa23a47
YO
2542 If we can find such a subseqence we need to turn the first
2543 element into a trie and then add the subsequent branch exact
2544 strings to the trie.
a3621e74 2545
8aa23a47 2546 We have two cases
a3621e74 2547
8aa23a47 2548 1. patterns where the whole set of branch can be converted.
a3621e74 2549
8aa23a47 2550 2. patterns where only a subset can be converted.
a3621e74 2551
8aa23a47
YO
2552 In case 1 we can replace the whole set with a single regop
2553 for the trie. In case 2 we need to keep the start and end
2554 branchs so
a3621e74 2555
8aa23a47
YO
2556 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2557 becomes BRANCH TRIE; BRANCH X;
786e8c11 2558
8aa23a47
YO
2559 There is an additional case, that being where there is a
2560 common prefix, which gets split out into an EXACT like node
2561 preceding the TRIE node.
a3621e74 2562
8aa23a47
YO
2563 If x(1..n)==tail then we can do a simple trie, if not we make
2564 a "jump" trie, such that when we match the appropriate word
2565 we "jump" to the appopriate tail node. Essentailly we turn
2566 a nested if into a case structure of sorts.
b515a41d 2567
8aa23a47
YO
2568 */
2569
2570 int made=0;
2571 if (!re_trie_maxbuff) {
2572 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2573 if (!SvIOK(re_trie_maxbuff))
2574 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2575 }
2576 if ( SvIV(re_trie_maxbuff)>=0 ) {
2577 regnode *cur;
2578 regnode *first = (regnode *)NULL;
2579 regnode *last = (regnode *)NULL;
2580 regnode *tail = scan;
2581 U8 optype = 0;
2582 U32 count=0;
a3621e74
YO
2583
2584#ifdef DEBUGGING
8aa23a47 2585 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2586#endif
8aa23a47
YO
2587 /* var tail is used because there may be a TAIL
2588 regop in the way. Ie, the exacts will point to the
2589 thing following the TAIL, but the last branch will
2590 point at the TAIL. So we advance tail. If we
2591 have nested (?:) we may have to move through several
2592 tails.
2593 */
2594
2595 while ( OP( tail ) == TAIL ) {
2596 /* this is the TAIL generated by (?:) */
2597 tail = regnext( tail );
2598 }
a3621e74 2599
8aa23a47
YO
2600
2601 DEBUG_OPTIMISE_r({
2602 regprop(RExC_rx, mysv, tail );
2603 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2604 (int)depth * 2 + 2, "",
2605 "Looking for TRIE'able sequences. Tail node is: ",
2606 SvPV_nolen_const( mysv )
2607 );
2608 });
2609
2610 /*
2611
2612 step through the branches, cur represents each
2613 branch, noper is the first thing to be matched
2614 as part of that branch and noper_next is the
2615 regnext() of that node. if noper is an EXACT
2616 and noper_next is the same as scan (our current
2617 position in the regex) then the EXACT branch is
2618 a possible optimization target. Once we have
2619 two or more consequetive such branches we can
2620 create a trie of the EXACT's contents and stich
2621 it in place. If the sequence represents all of
2622 the branches we eliminate the whole thing and
2623 replace it with a single TRIE. If it is a
2624 subsequence then we need to stitch it in. This
2625 means the first branch has to remain, and needs
2626 to be repointed at the item on the branch chain
2627 following the last branch optimized. This could
2628 be either a BRANCH, in which case the
2629 subsequence is internal, or it could be the
2630 item following the branch sequence in which
2631 case the subsequence is at the end.
2632
2633 */
2634
2635 /* dont use tail as the end marker for this traverse */
2636 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2637 regnode * const noper = NEXTOPER( cur );
b515a41d 2638#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2639 regnode * const noper_next = regnext( noper );
b515a41d
YO
2640#endif
2641
8aa23a47
YO
2642 DEBUG_OPTIMISE_r({
2643 regprop(RExC_rx, mysv, cur);
2644 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2645 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2646
2647 regprop(RExC_rx, mysv, noper);
2648 PerlIO_printf( Perl_debug_log, " -> %s",
2649 SvPV_nolen_const(mysv));
2650
2651 if ( noper_next ) {
2652 regprop(RExC_rx, mysv, noper_next );
2653 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2654 SvPV_nolen_const(mysv));
2655 }
2656 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2657 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2658 });
2659 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2660 : PL_regkind[ OP( noper ) ] == EXACT )
2661 || OP(noper) == NOTHING )
786e8c11 2662#ifdef NOJUMPTRIE
8aa23a47 2663 && noper_next == tail
786e8c11 2664#endif
8aa23a47
YO
2665 && count < U16_MAX)
2666 {
2667 count++;
2668 if ( !first || optype == NOTHING ) {
2669 if (!first) first = cur;
2670 optype = OP( noper );
2671 } else {
2672 last = cur;
2673 }
2674 } else {
2675 if ( last ) {
2676 make_trie( pRExC_state,
2677 startbranch, first, cur, tail, count,
2678 optype, depth+1 );
2679 }
2680 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2681#ifdef NOJUMPTRIE
8aa23a47 2682 && noper_next == tail
786e8c11 2683#endif
8aa23a47
YO
2684 ){
2685 count = 1;
2686 first = cur;
2687 optype = OP( noper );
2688 } else {
2689 count = 0;
2690 first = NULL;
2691 optype = 0;
2692 }
2693 last = NULL;
2694 }
2695 }
2696 DEBUG_OPTIMISE_r({
2697 regprop(RExC_rx, mysv, cur);
2698 PerlIO_printf( Perl_debug_log,
2699 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2700 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2701
2702 });
2703 if ( last ) {
2704 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2705#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2706 if ( ((made == MADE_EXACT_TRIE &&
2707 startbranch == first)
2708 || ( first_non_open == first )) &&
2709 depth==0 ) {
2710 flags |= SCF_TRIE_RESTUDY;
2711 if ( startbranch == first
2712 && scan == tail )
2713 {
2714 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2715 }
2716 }
3dab1dad 2717#endif
8aa23a47
YO
2718 }
2719 }
2720
2721 } /* do trie */
2722
653099ff 2723 }
8aa23a47
YO
2724 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2725 scan = NEXTOPER(NEXTOPER(scan));
2726 } else /* single branch is optimized. */
2727 scan = NEXTOPER(scan);
2728 continue;
2729 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2730 scan_frame *newframe = NULL;
2731 I32 paren;
2732 regnode *start;
2733 regnode *end;
2734
2735 if (OP(scan) != SUSPEND) {
2736 /* set the pointer */
2737 if (OP(scan) == GOSUB) {
2738 paren = ARG(scan);
2739 RExC_recurse[ARG2L(scan)] = scan;
2740 start = RExC_open_parens[paren-1];
2741 end = RExC_close_parens[paren-1];
2742 } else {
2743 paren = 0;
f8fc2ecf 2744 start = RExC_rxi->program + 1;
8aa23a47
YO
2745 end = RExC_opend;
2746 }
2747 if (!recursed) {
2748 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2749 SAVEFREEPV(recursed);
2750 }
2751 if (!PAREN_TEST(recursed,paren+1)) {
2752 PAREN_SET(recursed,paren+1);
2753 Newx(newframe,1,scan_frame);
2754 } else {
2755 if (flags & SCF_DO_SUBSTR) {
2756 scan_commit(pRExC_state,data,minlenp);
2757 data->longest = &(data->longest_float);
2758 }
2759 is_inf = is_inf_internal = 1;
2760 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2761 cl_anything(pRExC_state, data->start_class);
2762 flags &= ~SCF_DO_STCLASS;
2763 }
2764 } else {
2765 Newx(newframe,1,scan_frame);
2766 paren = stopparen;
2767 start = scan+2;
2768 end = regnext(scan);
2769 }
2770 if (newframe) {
2771 assert(start);
2772 assert(end);
2773 SAVEFREEPV(newframe);
2774 newframe->next = regnext(scan);
2775 newframe->last = last;
2776 newframe->stop = stopparen;
2777 newframe->prev = frame;
2778
2779 frame = newframe;
2780 scan = start;
2781 stopparen = paren;
2782 last = end;
2783
2784 continue;
2785 }
2786 }
2787 else if (OP(scan) == EXACT) {
2788 I32 l = STR_LEN(scan);
2789 UV uc;
2790 if (UTF) {
2791 const U8 * const s = (U8*)STRING(scan);
2792 l = utf8_length(s, s + l);
2793 uc = utf8_to_uvchr(s, NULL);
2794 } else {
2795 uc = *((U8*)STRING(scan));
2796 }
2797 min += l;
2798 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2799 /* The code below prefers earlier match for fixed
2800 offset, later match for variable offset. */
2801 if (data->last_end == -1) { /* Update the start info. */
2802 data->last_start_min = data->pos_min;
2803 data->last_start_max = is_inf
2804 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2805 }
8aa23a47
YO
2806 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2807 if (UTF)
2808 SvUTF8_on(data->last_found);
2809 {
2810 SV * const sv = data->last_found;
2811 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2812 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2813 if (mg && mg->mg_len >= 0)
2814 mg->mg_len += utf8_length((U8*)STRING(scan),
2815 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2816 }
8aa23a47
YO
2817 data->last_end = data->pos_min + l;
2818 data->pos_min += l; /* As in the first entry. */
2819 data->flags &= ~SF_BEFORE_EOL;
2820 }
2821 if (flags & SCF_DO_STCLASS_AND) {
2822 /* Check whether it is compatible with what we know already! */
2823 int compat = 1;
2824
2825 if (uc >= 0x100 ||
2826 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2827 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2828 && (!(data->start_class->flags & ANYOF_FOLD)
2829 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2830 )
2831 compat = 0;
2832 ANYOF_CLASS_ZERO(data->start_class);
2833 ANYOF_BITMAP_ZERO(data->start_class);
2834 if (compat)
2835 ANYOF_BITMAP_SET(data->start_class, uc);
2836 data->start_class->flags &= ~ANYOF_EOS;
2837 if (uc < 0x100)
2838 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2839 }
2840 else if (flags & SCF_DO_STCLASS_OR) {
2841 /* false positive possible if the class is case-folded */
2842 if (uc < 0x100)
2843 ANYOF_BITMAP_SET(data->start_class, uc);
2844 else
2845 data->start_class->flags |= ANYOF_UNICODE_ALL;
2846 data->start_class->flags &= ~ANYOF_EOS;
2847 cl_and(data->start_class, and_withp);
2848 }
2849 flags &= ~SCF_DO_STCLASS;
2850 }
2851 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2852 I32 l = STR_LEN(scan);
2853 UV uc = *((U8*)STRING(scan));
2854
2855 /* Search for fixed substrings supports EXACT only. */
2856 if (flags & SCF_DO_SUBSTR) {
2857 assert(data);
2858 scan_commit(pRExC_state, data, minlenp);
2859 }
2860 if (UTF) {
2861 const U8 * const s = (U8 *)STRING(scan);
2862 l = utf8_length(s, s + l);
2863 uc = utf8_to_uvchr(s, NULL);
2864 }
2865 min += l;
2866 if (flags & SCF_DO_SUBSTR)
2867 data->pos_min += l;
2868 if (flags & SCF_DO_STCLASS_AND) {
2869 /* Check whether it is compatible with what we know already! */
2870 int compat = 1;
2871
2872 if (uc >= 0x100 ||
2873 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2874 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2875 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2876 compat = 0;
2877 ANYOF_CLASS_ZERO(data->start_class);
2878 ANYOF_BITMAP_ZERO(data->start_class);
2879 if (compat) {
2880 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2881 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2882 data->start_class->flags |= ANYOF_FOLD;
2883 if (OP(scan) == EXACTFL)
2884 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2885 }
8aa23a47
YO
2886 }
2887 else if (flags & SCF_DO_STCLASS_OR) {
2888 if (data->start_class->flags & ANYOF_FOLD) {
2889 /* false positive possible if the class is case-folded.
2890 Assume that the locale settings are the same... */
1aa99e6b
IH
2891 if (uc < 0x100)
2892 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2893 data->start_class->flags &= ~ANYOF_EOS;
2894 }
8aa23a47 2895 cl_and(data->start_class, and_withp);
653099ff 2896 }
8aa23a47
YO
2897 flags &= ~SCF_DO_STCLASS;
2898 }
2899 else if (strchr((const char*)PL_varies,OP(scan))) {
2900 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2901 I32 f = flags, pos_before = 0;
2902 regnode * const oscan = scan;
2903 struct regnode_charclass_class this_class;
2904 struct regnode_charclass_class *oclass = NULL;
2905 I32 next_is_eval = 0;
2906
2907 switch (PL_regkind[OP(scan)]) {
2908 case WHILEM: /* End of (?:...)* . */
2909 scan = NEXTOPER(scan);
2910 goto finish;
2911 case PLUS:
2912 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2913 next = NEXTOPER(scan);
2914 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2915 mincount = 1;
2916 maxcount = REG_INFTY;
2917 next = regnext(scan);
2918 scan = NEXTOPER(scan);
2919 goto do_curly;
2920 }
2921 }
2922 if (flags & SCF_DO_SUBSTR)
2923 data->pos_min++;
2924 min++;
2925 /* Fall through. */
2926 case STAR:
2927 if (flags & SCF_DO_STCLASS) {
2928 mincount = 0;
2929 maxcount = REG_INFTY;
2930 next = regnext(scan);
2931 scan = NEXTOPER(scan);
2932 goto do_curly;
2933 }
2934 is_inf = is_inf_internal = 1;
2935 scan = regnext(scan);
c277df42 2936 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2937 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2938 data->longest = &(data->longest_float);
c277df42 2939 }
8aa23a47
YO
2940 goto optimize_curly_tail;
2941 case CURLY:
2942 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2943 && (scan->flags == stopparen))
2944 {
2945 mincount = 1;
2946 maxcount = 1;
2947 } else {
2948 mincount = ARG1(scan);
2949 maxcount = ARG2(scan);
653099ff 2950 }
8aa23a47
YO
2951 next = regnext(scan);
2952 if (OP(scan) == CURLYX) {
2953 I32 lp = (data ? *(data->last_closep) : 0);
2954 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2955 }
8aa23a47
YO
2956 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2957 next_is_eval = (OP(scan) == EVAL);
2958 do_curly:
2959 if (flags & SCF_DO_SUBSTR) {
2960 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2961 pos_before = data->pos_min;
b45f050a 2962 }
8aa23a47
YO
2963 if (data) {
2964 fl = data->flags;
2965 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2966 if (is_inf)
2967 data->flags |= SF_IS_INF;
2968 }
2969 if (flags & SCF_DO_STCLASS) {
2970 cl_init(pRExC_state, &this_class);
2971 oclass = data->start_class;
2972 data->start_class = &this_class;
2973 f |= SCF_DO_STCLASS_AND;
2974 f &= ~SCF_DO_STCLASS_OR;
2975 }
2976 /* These are the cases when once a subexpression
2977 fails at a particular position, it cannot succeed
2978 even after backtracking at the enclosing scope.
2979
2980 XXXX what if minimal match and we are at the
2981 initial run of {n,m}? */
2982 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2983 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2984
8aa23a47
YO
2985 /* This will finish on WHILEM, setting scan, or on NULL: */
2986 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2987 last, data, stopparen, recursed, NULL,
2988 (mincount == 0
2989 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2990
8aa23a47
YO
2991 if (flags & SCF_DO_STCLASS)
2992 data->start_class = oclass;
2993 if (mincount == 0 || minnext == 0) {
2994 if (flags & SCF_DO_STCLASS_OR) {
2995 cl_or(pRExC_state, data->start_class, &this_class);
2996 }
2997 else if (flags & SCF_DO_STCLASS_AND) {
2998 /* Switch to OR mode: cache the old value of
2999 * data->start_class */
3000 INIT_AND_WITHP;
3001 StructCopy(data->start_class, and_withp,
3002 struct regnode_charclass_class);
3003 flags &= ~SCF_DO_STCLASS_AND;
3004 StructCopy(&this_class, data->start_class,
3005 struct regnode_charclass_class);
3006 flags |= SCF_DO_STCLASS_OR;
3007 data->start_class->flags |= ANYOF_EOS;
3008 }
3009 } else { /* Non-zero len */
3010 if (flags & SCF_DO_STCLASS_OR) {
3011 cl_or(pRExC_state, data->start_class, &this_class);
3012 cl_and(data->start_class, and_withp);
3013 }
3014 else if (flags & SCF_DO_STCLASS_AND)
3015 cl_and(data->start_class, &this_class);
3016 flags &= ~SCF_DO_STCLASS;
3017 }
3018 if (!scan) /* It was not CURLYX, but CURLY. */
3019 scan = next;
3020 if ( /* ? quantifier ok, except for (?{ ... }) */
3021 (next_is_eval || !(mincount == 0 && maxcount == 1))
3022 && (minnext == 0) && (deltanext == 0)
3023 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3024 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3025 && ckWARN(WARN_REGEXP))
3026 {
3027 vWARN(RExC_parse,
3028 "Quantifier unexpected on zero-length expression");
3029 }
3030
3031 min += minnext * mincount;
3032 is_inf_internal |= ((maxcount == REG_INFTY
3033 && (minnext + deltanext) > 0)
3034 || deltanext == I32_MAX);
3035 is_inf |= is_inf_internal;
3036 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3037
3038 /* Try powerful optimization CURLYX => CURLYN. */
3039 if ( OP(oscan) == CURLYX && data
3040 && data->flags & SF_IN_PAR
3041 && !(data->flags & SF_HAS_EVAL)
3042 && !deltanext && minnext == 1 ) {
3043 /* Try to optimize to CURLYN. */
3044 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3045 regnode * const nxt1 = nxt;
497b47a8 3046#ifdef DEBUGGING
8aa23a47 3047 regnode *nxt2;
497b47a8 3048#endif
c277df42 3049
8aa23a47
YO
3050 /* Skip open. */
3051 nxt = regnext(nxt);
3052 if (!strchr((const char*)PL_simple,OP(nxt))
3053 && !(PL_regkind[OP(nxt)] == EXACT
3054 && STR_LEN(nxt) == 1))
3055 goto nogo;
497b47a8 3056#ifdef DEBUGGING
8aa23a47 3057 nxt2 = nxt;
497b47a8 3058#endif
8aa23a47
YO
3059 nxt = regnext(nxt);
3060 if (OP(nxt) != CLOSE)
3061 goto nogo;
3062 if (RExC_open_parens) {
3063 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3064 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3065 }
3066 /* Now we know that nxt2 is the only contents: */
3067 oscan->flags = (U8)ARG(nxt);
3068 OP(oscan) = CURLYN;
3069 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3070
c277df42 3071#ifdef DEBUGGING
8aa23a47
YO
3072 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3073 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3074 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3075 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3076 OP(nxt + 1) = OPTIMIZED; /* was count. */
3077 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3078#endif
8aa23a47
YO
3079 }
3080 nogo:
3081
3082 /* Try optimization CURLYX => CURLYM. */
3083 if ( OP(oscan) == CURLYX && data
3084 && !(data->flags & SF_HAS_PAR)
3085 && !(data->flags & SF_HAS_EVAL)
3086 && !deltanext /* atom is fixed width */
3087 && minnext != 0 /* CURLYM can't handle zero width */
3088 ) {
3089 /* XXXX How to optimize if data == 0? */
3090 /* Optimize to a simpler form. */
3091 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3092 regnode *nxt2;
3093
3094 OP(oscan) = CURLYM;
3095 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3096 && (OP(nxt2) != WHILEM))
3097 nxt = nxt2;
3098 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3099 /* Need to optimize away parenths. */
3100 if (data->flags & SF_IN_PAR) {
3101 /* Set the parenth number. */
3102 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3103
3104 if (OP(nxt) != CLOSE)
3105 FAIL("Panic opt close");
3106 oscan->flags = (U8)ARG(nxt);
3107 if (RExC_open_parens) {
3108 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3109 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3110 }
8aa23a47
YO
3111 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3112 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3113
c277df42 3114#ifdef DEBUGGING
8aa23a47
YO
3115 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3116 OP(nxt + 1) = OPTIMIZED; /* was count. */
3117 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3118 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3119#endif
c277df42 3120#if 0
8aa23a47
YO
3121 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3122 regnode *nnxt = regnext(nxt1);
3123
3124 if (nnxt == nxt) {
3125 if (reg_off_by_arg[OP(nxt1)])
3126 ARG_SET(nxt1, nxt2 - nxt1);
3127 else if (nxt2 - nxt1 < U16_MAX)
3128 NEXT_OFF(nxt1) = nxt2 - nxt1;
3129 else
3130 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3131 }
8aa23a47 3132 nxt1 = nnxt;
c277df42 3133 }
5d1c421c 3134#endif
8aa23a47
YO
3135 /* Optimize again: */
3136 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3137 NULL, stopparen, recursed, NULL, 0,depth+1);
3138 }
3139 else
3140 oscan->flags = 0;
3141 }
3142 else if ((OP(oscan) == CURLYX)
3143 && (flags & SCF_WHILEM_VISITED_POS)
3144 /* See the comment on a similar expression above.
3145 However, this time it not a subexpression
3146 we care about, but the expression itself. */
3147 && (maxcount == REG_INFTY)
3148 && data && ++data->whilem_c < 16) {
3149 /* This stays as CURLYX, we can put the count/of pair. */
3150 /* Find WHILEM (as in regexec.c) */
3151 regnode *nxt = oscan + NEXT_OFF(oscan);
3152
3153 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3154 nxt += ARG(nxt);
3155 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3156 | (RExC_whilem_seen << 4)); /* On WHILEM */
3157 }
3158 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3159 pars++;
3160 if (flags & SCF_DO_SUBSTR) {
3161 SV *last_str = NULL;
3162 int counted = mincount != 0;
a0ed51b3 3163
8aa23a47
YO
3164 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3165#if defined(SPARC64_GCC_WORKAROUND)
3166 I32 b = 0;
3167 STRLEN l = 0;
3168 const char *s = NULL;
3169 I32 old = 0;
b515a41d 3170
8aa23a47
YO
3171 if (pos_before >= data->last_start_min)
3172 b = pos_before;
3173 else
3174 b = data->last_start_min;
b515a41d 3175
8aa23a47
YO
3176 l = 0;
3177 s = SvPV_const(data->last_found, l);
3178 old = b - data->last_start_min;
3179
3180#else
3181 I32 b = pos_before >= data->last_start_min
3182 ? pos_before : data->last_start_min;
3183 STRLEN l;
3184 const char * const s = SvPV_const(data->last_found, l);
3185 I32 old = b - data->last_start_min;
3186#endif
3187
3188 if (UTF)
3189 old = utf8_hop((U8*)s, old) - (U8*)s;
3190
3191 l -= old;
3192 /* Get the added string: */
3193 last_str = newSVpvn(s + old, l);
3194 if (UTF)
3195 SvUTF8_on(last_str);
3196 if (deltanext == 0 && pos_before == b) {
3197 /* What was added is a constant string */
3198 if (mincount > 1) {
3199 SvGROW(last_str, (mincount * l) + 1);
3200 repeatcpy(SvPVX(last_str) + l,
3201 SvPVX_const(last_str), l, mincount - 1);
3202 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3203 /* Add additional parts. */
3204 SvCUR_set(data->last_found,
3205 SvCUR(data->last_found) - l);
3206 sv_catsv(data->last_found, last_str);
3207 {
3208 SV * sv = data->last_found;
3209 MAGIC *mg =
3210 SvUTF8(sv) && SvMAGICAL(sv) ?
3211 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3212 if (mg && mg->mg_len >= 0)
3213 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3214 }
8aa23a47 3215 data->last_end += l * (mincount - 1);
b515a41d 3216 }
8aa23a47
YO
3217 } else {
3218 /* start offset must point into the last copy */
3219 data->last_start_min += minnext * (mincount - 1);
3220 data->last_start_max += is_inf ? I32_MAX
3221 : (maxcount - 1) * (minnext + data->pos_delta);
3222 }
c277df42 3223 }
8aa23a47
YO
3224 /* It is counted once already... */
3225 data->pos_min += minnext * (mincount - counted);
3226 data->pos_delta += - counted * deltanext +
3227 (minnext + deltanext) * maxcount - minnext * mincount;
3228 if (mincount != maxcount) {
3229 /* Cannot extend fixed substrings found inside
3230 the group. */
3231 scan_commit(pRExC_state,data,minlenp);
3232 if (mincount && last_str) {
3233 SV * const sv = data->last_found;
3234 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3235 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3236
3237 if (mg)
3238 mg->mg_len = -1;
3239 sv_setsv(sv, last_str);
3240 data->last_end = data->pos_min;
3241 data->last_start_min =
3242 data->pos_min - CHR_SVLEN(last_str);
3243 data->last_start_max = is_inf
3244 ? I32_MAX
3245 : data->pos_min + data->pos_delta
3246 - CHR_SVLEN(last_str);
3247 }
3248 data->longest = &(data->longest_float);
3249 }
3250 SvREFCNT_dec(last_str);
c277df42 3251 }
8aa23a47
YO
3252 if (data && (fl & SF_HAS_EVAL))
3253 data->flags |= SF_HAS_EVAL;
3254 optimize_curly_tail:
3255 if (OP(oscan) != CURLYX) {
3256 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3257 && NEXT_OFF(next))
3258 NEXT_OFF(oscan) += NEXT_OFF(next);
3259 }
3260 continue;
3261 default: /* REF and CLUMP only? */
3262 if (flags & SCF_DO_SUBSTR) {
3263 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3264 data->longest = &(data->longest_float);
3265 }
3266 is_inf = is_inf_internal = 1;
3267 if (flags & SCF_DO_STCLASS_OR)
3268 cl_anything(pRExC_state, data->start_class);
3269 flags &= ~SCF_DO_STCLASS;
3270 break;
c277df42 3271 }
8aa23a47
YO
3272 }
3273 else if (strchr((const char*)PL_simple,OP(scan))) {
3274 int value = 0;
653099ff 3275
8aa23a47
YO
3276 if (flags & SCF_DO_SUBSTR) {
3277 scan_commit(pRExC_state,data,minlenp);
3278 data->pos_min++;
3279 }
3280 min++;
3281 if (flags & SCF_DO_STCLASS) {
3282 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3283
8aa23a47
YO
3284 /* Some of the logic below assumes that switching
3285 locale on will only add false positives. */
3286 switch (PL_regkind[OP(scan)]) {
3287 case SANY:
3288 default:
3289 do_default:
3290 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3291 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3292 cl_anything(pRExC_state, data->start_class);
3293 break;
3294 case REG_ANY:
3295 if (OP(scan) == SANY)
3296 goto do_default;
3297 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3298 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3299 || (data->start_class->flags & ANYOF_CLASS));
3300 cl_anything(pRExC_state, data->start_class);
653099ff 3301 }
8aa23a47
YO
3302 if (flags & SCF_DO_STCLASS_AND || !value)
3303 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3304 break;
3305 case ANYOF:
3306 if (flags & SCF_DO_STCLASS_AND)
3307 cl_and(data->start_class,
3308 (struct regnode_charclass_class*)scan);
653099ff 3309 else
8aa23a47
YO
3310 cl_or(pRExC_state, data->start_class,
3311 (struct regnode_charclass_class*)scan);
3312 break;
3313 case ALNUM:
3314 if (flags & SCF_DO_STCLASS_AND) {
3315 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3316 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3317 for (value = 0; value < 256; value++)
3318 if (!isALNUM(value))
3319 ANYOF_BITMAP_CLEAR(data->start_class, value);
3320 }
653099ff 3321 }
8aa23a47
YO
3322 else {
3323 if (data->start_class->flags & ANYOF_LOCALE)
3324 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3325 else {
3326 for (value = 0; value < 256; value++)
3327 if (isALNUM(value))
3328 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3329 }
8aa23a47
YO
3330 }
3331 break;
3332 case ALNUML:
3333 if (flags & SCF_DO_STCLASS_AND) {
3334 if (data->start_class->flags & ANYOF_LOCALE)
3335 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3336 }
3337 else {
3338 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3339 data->start_class->flags |= ANYOF_LOCALE;
3340 }
3341 break;
3342 case NALNUM:
3343 if (flags & SCF_DO_STCLASS_AND) {
3344 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3345 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3346 for (value = 0; value < 256; value++)
3347 if (isALNUM(value))
3348 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3349 }
3350 }
8aa23a47
YO
3351 else {
3352 if (data->start_class->flags & ANYOF_LOCALE)
3353 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3354 else {
3355 for (value = 0; value < 256; value++)
3356 if (!isALNUM(value))
3357 ANYOF_BITMAP_SET(data->start_class, value);
3358 }
653099ff 3359 }
8aa23a47
YO
3360 break;
3361 case NALNUML:
3362 if (flags & SCF_DO_STCLASS_AND) {
3363 if (data->start_class->flags & ANYOF_LOCALE)
3364 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3365 }
8aa23a47
YO
3366 else {
3367 data->start_class->flags |= ANYOF_LOCALE;
3368 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3369 }
3370 break;
3371 case SPACE:
3372 if (flags & SCF_DO_STCLASS_AND) {
3373 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3374 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3375 for (value = 0; value < 256; value++)
3376 if (!isSPACE(value))
3377 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3378 }
3379 }
8aa23a47
YO
3380 else {
3381 if (data->start_class->flags & ANYOF_LOCALE)
3382 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3383 else {
3384 for (value = 0; value < 256; value++)
3385 if (isSPACE(value))
3386 ANYOF_BITMAP_SET(data->start_class, value);
3387 }
653099ff 3388 }
8aa23a47
YO
3389 break;
3390 case SPACEL:
3391 if (flags & SCF_DO_STCLASS_AND) {
3392 if (data->start_class->flags & ANYOF_LOCALE)
3393 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3394 }
3395 else {
3396 data->start_class->flags |= ANYOF_LOCALE;
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3398 }
3399 break;
3400 case NSPACE:
3401 if (flags & SCF_DO_STCLASS_AND) {
3402 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3403 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3404 for (value = 0; value < 256; value++)
3405 if (isSPACE(value))
3406 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3407 }
8aa23a47
YO
3408 }
3409 else {
3410 if (data->start_class->flags & ANYOF_LOCALE)
3411 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3412 else {
3413 for (value = 0; value < 256; value++)
3414 if (!isSPACE(value))
3415 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3416 }
3417 }
8aa23a47
YO
3418 break;
3419 case NSPACEL:
3420 if (flags & SCF_DO_STCLASS_AND) {
3421 if (data->start_class->flags & ANYOF_LOCALE) {
3422 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3423 for (value = 0; value < 256; value++)
3424 if (!isSPACE(value))
3425 ANYOF_BITMAP_CLEAR(data->start_class, value);
3426 }
653099ff 3427 }
8aa23a47
YO
3428 else {
3429 data->start_class->flags |= ANYOF_LOCALE;
3430 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3431 }
3432 break;
3433 case DIGIT:
3434 if (flags & SCF_DO_STCLASS_AND) {
3435 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3436 for (value = 0; value < 256; value++)
3437 if (!isDIGIT(value))
3438 ANYOF_BITMAP_CLEAR(data->start_class, value);
3439 }
3440 else {
3441 if (data->start_class->flags & ANYOF_LOCALE)
3442 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3443 else {
3444 for (value = 0; value < 256; value++)
3445 if (isDIGIT(value))
3446 ANYOF_BITMAP_SET(data->start_class, value);
3447 }
3448 }
3449 break;
3450 case NDIGIT:
3451 if (flags & SCF_DO_STCLASS_AND) {
3452 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3453 for (value = 0; value < 256; value++)
3454 if (isDIGIT(value))
3455 ANYOF_BITMAP_CLEAR(data->start_class, value);
3456 }
3457 else {
3458 if (data->start_class->flags & ANYOF_LOCALE)
3459 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3460 else {
3461 for (value = 0; value < 256; value++)
3462 if (!isDIGIT(value))
3463 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3464 }
3465 }
8aa23a47
YO
3466 break;
3467 }
3468 if (flags & SCF_DO_STCLASS_OR)
3469 cl_and(data->start_class, and_withp);
3470 flags &= ~SCF_DO_STCLASS;
3471 }
3472 }
3473 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3474 data->flags |= (OP(scan) == MEOL
3475 ? SF_BEFORE_MEOL
3476 : SF_BEFORE_SEOL);
3477 }
3478 else if ( PL_regkind[OP(scan)] == BRANCHJ
3479 /* Lookbehind, or need to calculate parens/evals/stclass: */
3480 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3481 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3482 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3483 || OP(scan) == UNLESSM )
3484 {
3485 /* Negative Lookahead/lookbehind
3486 In this case we can't do fixed string optimisation.
3487 */
1de06328 3488
8aa23a47
YO
3489 I32 deltanext, minnext, fake = 0;
3490 regnode *nscan;
3491 struct regnode_charclass_class intrnl;
3492 int f = 0;
1de06328 3493
8aa23a47
YO
3494 data_fake.flags = 0;
3495 if (data) {
3496 data_fake.whilem_c = data->whilem_c;
3497 data_fake.last_closep = data->last_closep;
c277df42 3498 }
8aa23a47
YO
3499 else
3500 data_fake.last_closep = &fake;
58e23c8d 3501 data_fake.pos_delta = delta;
8aa23a47
YO
3502 if ( flags & SCF_DO_STCLASS && !scan->flags
3503 && OP(scan) == IFMATCH ) { /* Lookahead */
3504 cl_init(pRExC_state, &intrnl);
3505 data_fake.start_class = &intrnl;
3506 f |= SCF_DO_STCLASS_AND;
3507 }
3508 if (flags & SCF_WHILEM_VISITED_POS)
3509 f |= SCF_WHILEM_VISITED_POS;
3510 next = regnext(scan);
3511 nscan = NEXTOPER(NEXTOPER(scan));
3512 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3513 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3514 if (scan->flags) {
3515 if (deltanext) {
58e23c8d 3516 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3517 }
3518 else if (minnext > (I32)U8_MAX) {
58e23c8d 3519 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3520 }
3521 scan->flags = (U8)minnext;
3522 }
3523 if (data) {
3524 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3525 pars++;
3526 if (data_fake.flags & SF_HAS_EVAL)
3527 data->flags |= SF_HAS_EVAL;
3528 data->whilem_c = data_fake.whilem_c;
3529 }
3530 if (f & SCF_DO_STCLASS_AND) {
3531 const int was = (data->start_class->flags & ANYOF_EOS);
3532
3533 cl_and(data->start_class, &intrnl);
3534 if (was)
3535 data->start_class->flags |= ANYOF_EOS;
3536 }
cb434fcc 3537 }
8aa23a47
YO
3538#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3539 else {
3540 /* Positive Lookahead/lookbehind
3541 In this case we can do fixed string optimisation,
3542 but we must be careful about it. Note in the case of
3543 lookbehind the positions will be offset by the minimum
3544 length of the pattern, something we won't know about
3545 until after the recurse.
3546 */
3547 I32 deltanext, fake = 0;
3548 regnode *nscan;
3549 struct regnode_charclass_class intrnl;
3550 int f = 0;
3551 /* We use SAVEFREEPV so that when the full compile
3552 is finished perl will clean up the allocated
3553 minlens when its all done. This was we don't
3554 have to worry about freeing them when we know
3555 they wont be used, which would be a pain.
3556 */
3557 I32 *minnextp;
3558 Newx( minnextp, 1, I32 );
3559 SAVEFREEPV(minnextp);
3560
3561 if (data) {
3562 StructCopy(data, &data_fake, scan_data_t);
3563 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3564 f |= SCF_DO_SUBSTR;
3565 if (scan->flags)
3566 scan_commit(pRExC_state, &data_fake,minlenp);
3567 data_fake.last_found=newSVsv(data->last_found);
3568 }
3569 }
3570 else
3571 data_fake.last_closep = &fake;
3572 data_fake.flags = 0;
58e23c8d 3573 data_fake.pos_delta = delta;
8aa23a47
YO
3574 if (is_inf)
3575 data_fake.flags |= SF_IS_INF;
3576 if ( flags & SCF_DO_STCLASS && !scan->flags
3577 && OP(scan) == IFMATCH ) { /* Lookahead */
3578 cl_init(pRExC_state, &intrnl);
3579 data_fake.start_class = &intrnl;
3580 f |= SCF_DO_STCLASS_AND;
3581 }
3582 if (flags & SCF_WHILEM_VISITED_POS)
3583 f |= SCF_WHILEM_VISITED_POS;
3584 next = regnext(scan);
3585 nscan = NEXTOPER(NEXTOPER(scan));
3586
3587 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3588 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3589 if (scan->flags) {
3590 if (deltanext) {
58e23c8d 3591 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3592 }
3593 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3594 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3595 }
3596 scan->flags = (U8)*minnextp;
3597 }
3598
3599 *minnextp += min;
3600
3601 if (f & SCF_DO_STCLASS_AND) {
3602 const int was = (data->start_class->flags & ANYOF_EOS);
3603
3604 cl_and(data->start_class, &intrnl);
3605 if (was)
3606 data->start_class->flags |= ANYOF_EOS;
3607 }
3608 if (data) {
3609 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3610 pars++;
3611 if (data_fake.flags & SF_HAS_EVAL)
3612 data->flags |= SF_HAS_EVAL;
3613 data->whilem_c = data_fake.whilem_c;
3614 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3615 if (RExC_rx->minlen<*minnextp)
3616 RExC_rx->minlen=*minnextp;
3617 scan_commit(pRExC_state, &data_fake, minnextp);
3618 SvREFCNT_dec(data_fake.last_found);
3619
3620 if ( data_fake.minlen_fixed != minlenp )
3621 {
3622 data->offset_fixed= data_fake.offset_fixed;
3623 data->minlen_fixed= data_fake.minlen_fixed;
3624 data->lookbehind_fixed+= scan->flags;
3625 }
3626 if ( data_fake.minlen_float != minlenp )
3627 {
3628 data->minlen_float= data_fake.minlen_float;
3629 data->offset_float_min=data_fake.offset_float_min;
3630 data->offset_float_max=data_fake.offset_float_max;
3631 data->lookbehind_float+= scan->flags;
3632 }
3633 }
3634 }
3635
3636
40d049e4 3637 }
8aa23a47
YO
3638#endif
3639 }
3640 else if (OP(scan) == OPEN) {
3641 if (stopparen != (I32)ARG(scan))
3642 pars++;
3643 }
3644 else if (OP(scan) == CLOSE) {
3645 if (stopparen == (I32)ARG(scan)) {
3646 break;
3647 }
3648 if ((I32)ARG(scan) == is_par) {
3649 next = regnext(scan);
b515a41d 3650
8aa23a47
YO
3651 if ( next && (OP(next) != WHILEM) && next < last)
3652 is_par = 0; /* Disable optimization */
40d049e4 3653 }
8aa23a47
YO
3654 if (data)
3655 *(data->last_closep) = ARG(scan);
3656 }
3657 else if (OP(scan) == EVAL) {
c277df42
IZ
3658 if (data)
3659 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3660 }
3661 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3662 if (flags & SCF_DO_SUBSTR) {
3663 scan_commit(pRExC_state,data,minlenp);
3664 flags &= ~SCF_DO_SUBSTR;
40d049e4 3665 }
8aa23a47
YO
3666 if (data && OP(scan)==ACCEPT) {
3667 data->flags |= SCF_SEEN_ACCEPT;
3668 if (stopmin > min)
3669 stopmin = min;
e2e6a0f1 3670 }
8aa23a47
YO
3671 }
3672 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3673 {
0f5d15d6 3674 if (flags & SCF_DO_SUBSTR) {
1de06328 3675 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3676 data->longest = &(data->longest_float);
3677 }
3678 is_inf = is_inf_internal = 1;
653099ff 3679 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3680 cl_anything(pRExC_state, data->start_class);
96776eda 3681 flags &= ~SCF_DO_STCLASS;
8aa23a47 3682 }
58e23c8d 3683 else if (OP(scan) == GPOS) {
bbe252da 3684 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3685 !(delta || is_inf || (data && data->pos_delta)))
3686 {
bbe252da
YO
3687 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3688 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3689 if (RExC_rx->gofs < (U32)min)
3690 RExC_rx->gofs = min;
3691 } else {
bbe252da 3692 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3693 RExC_rx->gofs = 0;
3694 }
3695 }
786e8c11 3696#ifdef TRIE_STUDY_OPT
40d049e4 3697#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3698 else if (PL_regkind[OP(scan)] == TRIE) {
3699 /* NOTE - There is similar code to this block above for handling
3700 BRANCH nodes on the initial study. If you change stuff here
3701 check there too. */
3702 regnode *trie_node= scan;
3703 regnode *tail= regnext(scan);
f8fc2ecf 3704 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3705 I32 max1 = 0, min1 = I32_MAX;
3706 struct regnode_charclass_class accum;
3707
3708 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3709 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3710 if (flags & SCF_DO_STCLASS)
3711 cl_init_zero(pRExC_state, &accum);
3712
3713 if (!trie->jump) {
3714 min1= trie->minlen;
3715 max1= trie->maxlen;
3716 } else {
3717 const regnode *nextbranch= NULL;
3718 U32 word;
3719
3720 for ( word=1 ; word <= trie->wordcount ; word++)
3721 {
3722 I32 deltanext=0, minnext=0, f = 0, fake;
3723 struct regnode_charclass_class this_class;
3724
3725 data_fake.flags = 0;
3726 if (data) {
3727 data_fake.whilem_c = data->whilem_c;
3728 data_fake.last_closep = data->last_closep;
3729 }
3730 else
3731 data_fake.last_closep = &fake;
58e23c8d 3732 data_fake.pos_delta = delta;
8aa23a47
YO
3733 if (flags & SCF_DO_STCLASS) {
3734 cl_init(pRExC_state, &this_class);
3735 data_fake.start_class = &this_class;
3736 f = SCF_DO_STCLASS_AND;
3737 }
3738 if (flags & SCF_WHILEM_VISITED_POS)
3739 f |= SCF_WHILEM_VISITED_POS;
3740
3741 if (trie->jump[word]) {
3742 if (!nextbranch)
3743 nextbranch = trie_node + trie->jump[0];
3744 scan= trie_node + trie->jump[word];
3745 /* We go from the jump point to the branch that follows
3746 it. Note this means we need the vestigal unused branches
3747 even though they arent otherwise used.
3748 */
3749 minnext = study_chunk(pRExC_state, &scan, minlenp,
3750 &deltanext, (regnode *)nextbranch, &data_fake,
3751 stopparen, recursed, NULL, f,depth+1);
3752 }
3753 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3754 nextbranch= regnext((regnode*)nextbranch);
3755
3756 if (min1 > (I32)(minnext + trie->minlen))
3757 min1 = minnext + trie->minlen;
3758 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3759 max1 = minnext + deltanext + trie->maxlen;
3760 if (deltanext == I32_MAX)
3761 is_inf = is_inf_internal = 1;
3762
3763 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3764 pars++;
3765 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3766 if ( stopmin > min + min1)
3767 stopmin = min + min1;
3768 flags &= ~SCF_DO_SUBSTR;
3769 if (data)
3770 data->flags |= SCF_SEEN_ACCEPT;
3771 }
3772 if (data) {
3773 if (data_fake.flags & SF_HAS_EVAL)
3774 data->flags |= SF_HAS_EVAL;
3775 data->whilem_c = data_fake.whilem_c;
3776 }
3777 if (flags & SCF_DO_STCLASS)
3778 cl_or(pRExC_state, &accum, &this_class);
3779 }
3780 }
3781 if (flags & SCF_DO_SUBSTR) {
3782 data->pos_min += min1;
3783 data->pos_delta += max1 - min1;
3784 if (max1 != min1 || is_inf)
3785 data->longest = &(data->longest_float);
3786 }
3787 min += min1;
3788 delta += max1 - min1;
3789 if (flags & SCF_DO_STCLASS_OR) {
3790 cl_or(pRExC_state, data->start_class, &accum);
3791 if (min1) {
3792 cl_and(data->start_class, and_withp);
3793 flags &= ~SCF_DO_STCLASS;
3794 }
3795 }
3796 else if (flags & SCF_DO_STCLASS_AND) {
3797 if (min1) {
3798 cl_and(data->start_class, &accum);
3799 flags &= ~SCF_DO_STCLASS;
3800 }
3801 else {
3802 /* Switch to OR mode: cache the old value of
3803 * data->start_class */
3804 INIT_AND_WITHP;
3805 StructCopy(data->start_class, and_withp,
3806 struct regnode_charclass_class);
3807 flags &= ~SCF_DO_STCLASS_AND;
3808 StructCopy(&accum, data->start_class,
3809 struct regnode_charclass_class);
3810 flags |= SCF_DO_STCLASS_OR;
3811 data->start_class->flags |= ANYOF_EOS;
3812 }
3813 }
3814 scan= tail;
3815 continue;
3816 }
786e8c11 3817#else
8aa23a47 3818 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3819 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3820 U8*bang=NULL;
3821
3822 min += trie->minlen;
3823 delta += (trie->maxlen - trie->minlen);
3824 flags &= ~SCF_DO_STCLASS; /* xxx */
3825 if (flags & SCF_DO_SUBSTR) {
3826 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3827 data->pos_min += trie->minlen;
3828 data->pos_delta += (trie->maxlen - trie->minlen);
3829 if (trie->maxlen != trie->minlen)
3830 data->longest = &(data->longest_float);
3831 }
3832 if (trie->jump) /* no more substrings -- for now /grr*/
3833 flags &= ~SCF_DO_SUBSTR;
b515a41d 3834 }
8aa23a47
YO
3835#endif /* old or new */
3836#endif /* TRIE_STUDY_OPT */
3837 /* Else: zero-length, ignore. */
3838 scan = regnext(scan);
3839 }
3840 if (frame) {
3841 last = frame->last;
3842 scan = frame->next;
3843 stopparen = frame->stop;
3844 frame = frame->prev;
3845 goto fake_study_recurse;
c277df42
IZ
3846 }
3847
3848 finish:
8aa23a47
YO
3849 assert(!frame);
3850
c277df42 3851 *scanp = scan;
aca2d497 3852 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3853 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3854 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3855 if (is_par > (I32)U8_MAX)
c277df42
IZ
3856 is_par = 0;
3857 if (is_par && pars==1 && data) {
3858 data->flags |= SF_IN_PAR;
3859 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3860 }
3861 else if (pars && data) {
c277df42
IZ
3862 data->flags |= SF_HAS_PAR;
3863 da