This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Stage 1 of threadsafe-ing the trie strucutres - use shared malloc.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
40d049e4
YO
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
3dab1dad 135#ifdef DEBUGGING
be8e71aa 136 const char *lastparse;
3dab1dad
YO
137 I32 lastnum;
138#define RExC_lastparse (pRExC_state->lastparse)
139#define RExC_lastnum (pRExC_state->lastnum)
140#endif
830247a4
IZ
141} RExC_state_t;
142
e2509266 143#define RExC_flags (pRExC_state->flags)
830247a4
IZ
144#define RExC_precomp (pRExC_state->precomp)
145#define RExC_rx (pRExC_state->rx)
f8fc2ecf 146#define RExC_rxi (pRExC_state->rxi)
fac92740 147#define RExC_start (pRExC_state->start)
830247a4
IZ
148#define RExC_end (pRExC_state->end)
149#define RExC_parse (pRExC_state->parse)
150#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 151#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 152#define RExC_emit (pRExC_state->emit)
fac92740 153#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
154#define RExC_naughty (pRExC_state->naughty)
155#define RExC_sawback (pRExC_state->sawback)
156#define RExC_seen (pRExC_state->seen)
157#define RExC_size (pRExC_state->size)
158#define RExC_npar (pRExC_state->npar)
c74340f9 159#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 160#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 165#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
166#define RExC_open_parens (pRExC_state->open_parens)
167#define RExC_close_parens (pRExC_state->close_parens)
168#define RExC_opend (pRExC_state->opend)
81714fb9 169#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
170#define RExC_recurse (pRExC_state->recurse)
171#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 172
a687059c
LW
173#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
174#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
175 ((*s) == '{' && regcurly(s)))
a687059c 176
35c8bce7
LW
177#ifdef SPSTART
178#undef SPSTART /* dratted cpp namespace... */
179#endif
a687059c
LW
180/*
181 * Flags to be passed up and down.
182 */
a687059c 183#define WORST 0 /* Worst case. */
821b33a5 184#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
185#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
186#define SPSTART 0x4 /* Starts with * or +. */
187#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 188
3dab1dad
YO
189#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
190
07be1b83
YO
191/* whether trie related optimizations are enabled */
192#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
193#define TRIE_STUDY_OPT
786e8c11 194#define FULL_TRIE_STUDY
07be1b83
YO
195#define TRIE_STCLASS
196#endif
1de06328
YO
197
198
40d049e4
YO
199
200#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
201#define PBITVAL(paren) (1 << ((paren) & 7))
202#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
203#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
204#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205
206
1de06328
YO
207/* About scan_data_t.
208
209 During optimisation we recurse through the regexp program performing
210 various inplace (keyhole style) optimisations. In addition study_chunk
211 and scan_commit populate this data structure with information about
212 what strings MUST appear in the pattern. We look for the longest
213 string that must appear for at a fixed location, and we look for the
214 longest string that may appear at a floating location. So for instance
215 in the pattern:
216
217 /FOO[xX]A.*B[xX]BAR/
218
219 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
220 strings (because they follow a .* construct). study_chunk will identify
221 both FOO and BAR as being the longest fixed and floating strings respectively.
222
223 The strings can be composites, for instance
224
225 /(f)(o)(o)/
226
227 will result in a composite fixed substring 'foo'.
228
229 For each string some basic information is maintained:
230
231 - offset or min_offset
232 This is the position the string must appear at, or not before.
233 It also implicitly (when combined with minlenp) tells us how many
234 character must match before the string we are searching.
235 Likewise when combined with minlenp and the length of the string
236 tells us how many characters must appear after the string we have
237 found.
238
239 - max_offset
240 Only used for floating strings. This is the rightmost point that
241 the string can appear at. Ifset to I32 max it indicates that the
242 string can occur infinitely far to the right.
243
244 - minlenp
245 A pointer to the minimum length of the pattern that the string
246 was found inside. This is important as in the case of positive
247 lookahead or positive lookbehind we can have multiple patterns
248 involved. Consider
249
250 /(?=FOO).*F/
251
252 The minimum length of the pattern overall is 3, the minimum length
253 of the lookahead part is 3, but the minimum length of the part that
254 will actually match is 1. So 'FOO's minimum length is 3, but the
255 minimum length for the F is 1. This is important as the minimum length
256 is used to determine offsets in front of and behind the string being
257 looked for. Since strings can be composites this is the length of the
258 pattern at the time it was commited with a scan_commit. Note that
259 the length is calculated by study_chunk, so that the minimum lengths
260 are not known until the full pattern has been compiled, thus the
261 pointer to the value.
262
263 - lookbehind
264
265 In the case of lookbehind the string being searched for can be
266 offset past the start point of the final matching string.
267 If this value was just blithely removed from the min_offset it would
268 invalidate some of the calculations for how many chars must match
269 before or after (as they are derived from min_offset and minlen and
270 the length of the string being searched for).
271 When the final pattern is compiled and the data is moved from the
272 scan_data_t structure into the regexp structure the information
273 about lookbehind is factored in, with the information that would
274 have been lost precalculated in the end_shift field for the
275 associated string.
276
277 The fields pos_min and pos_delta are used to store the minimum offset
278 and the delta to the maximum offset at the current point in the pattern.
279
280*/
2c2d71f5
JH
281
282typedef struct scan_data_t {
1de06328
YO
283 /*I32 len_min; unused */
284 /*I32 len_delta; unused */
2c2d71f5
JH
285 I32 pos_min;
286 I32 pos_delta;
287 SV *last_found;
1de06328 288 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
289 I32 last_start_min;
290 I32 last_start_max;
1de06328
YO
291 SV **longest; /* Either &l_fixed, or &l_float. */
292 SV *longest_fixed; /* longest fixed string found in pattern */
293 I32 offset_fixed; /* offset where it starts */
294 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
295 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
296 SV *longest_float; /* longest floating string found in pattern */
297 I32 offset_float_min; /* earliest point in string it can appear */
298 I32 offset_float_max; /* latest point in string it can appear */
299 I32 *minlen_float; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
301 I32 flags;
302 I32 whilem_c;
cb434fcc 303 I32 *last_closep;
653099ff 304 struct regnode_charclass_class *start_class;
2c2d71f5
JH
305} scan_data_t;
306
a687059c 307/*
e50aee73 308 * Forward declarations for pregcomp()'s friends.
a687059c 309 */
a0d0e21e 310
27da23d5 311static const scan_data_t zero_scan_data =
1de06328 312 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
313
314#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
315#define SF_BEFORE_SEOL 0x0001
316#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
317#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
318#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319
09b7f37c
CB
320#ifdef NO_UNARY_PLUS
321# define SF_FIX_SHIFT_EOL (0+2)
322# define SF_FL_SHIFT_EOL (0+4)
323#else
324# define SF_FIX_SHIFT_EOL (+2)
325# define SF_FL_SHIFT_EOL (+4)
326#endif
c277df42
IZ
327
328#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
329#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
330
331#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
332#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
333#define SF_IS_INF 0x0040
334#define SF_HAS_PAR 0x0080
335#define SF_IN_PAR 0x0100
336#define SF_HAS_EVAL 0x0200
337#define SCF_DO_SUBSTR 0x0400
653099ff
GS
338#define SCF_DO_STCLASS_AND 0x0800
339#define SCF_DO_STCLASS_OR 0x1000
340#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 341#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 342
786e8c11 343#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 344#define SCF_SEEN_ACCEPT 0x8000
07be1b83 345
eb160463 346#define UTF (RExC_utf8 != 0)
bbe252da
YO
347#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
348#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 349
ffc61ed2 350#define OOB_UNICODE 12345678
93733859 351#define OOB_NAMEDCLASS -1
b8c5462f 352
a0ed51b3
LW
353#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
354#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355
8615cb43 356
b45f050a
JF
357/* length of regex to show in messages that don't mark a position within */
358#define RegexLengthToShowInErrorMessages 127
359
360/*
361 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
362 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
363 * op/pragma/warn/regcomp.
364 */
7253e4e3
RK
365#define MARKER1 "<-- HERE" /* marker as it appears in the description */
366#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 367
7253e4e3 368#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
369
370/*
371 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
372 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 * "...".
374 */
58e23c8d 375#define _FAIL(code) STMT_START { \
bfed75c6 376 const char *ellipses = ""; \
ccb2c380
MP
377 IV len = RExC_end - RExC_precomp; \
378 \
379 if (!SIZE_ONLY) \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 if (len > RegexLengthToShowInErrorMessages) { \
382 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
383 len = RegexLengthToShowInErrorMessages - 10; \
384 ellipses = "..."; \
385 } \
58e23c8d 386 code; \
ccb2c380 387} STMT_END
8615cb43 388
58e23c8d
YO
389#define FAIL(msg) _FAIL( \
390 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
391 msg, (int)len, RExC_precomp, ellipses))
392
393#define FAIL2(msg,arg) _FAIL( \
394 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
395 arg, (int)len, RExC_precomp, ellipses))
396
b45f050a 397/*
b45f050a
JF
398 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
399 */
ccb2c380 400#define Simple_vFAIL(m) STMT_START { \
a28509cc 401 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
402 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
403 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
408 */
ccb2c380
MP
409#define vFAIL(m) STMT_START { \
410 if (!SIZE_ONLY) \
411 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
412 Simple_vFAIL(m); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Like Simple_vFAIL(), but accepts two arguments.
417 */
ccb2c380 418#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 419 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
420 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
422} STMT_END
b45f050a
JF
423
424/*
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
426 */
ccb2c380
MP
427#define vFAIL2(m,a1) STMT_START { \
428 if (!SIZE_ONLY) \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
430 Simple_vFAIL2(m, a1); \
431} STMT_END
b45f050a
JF
432
433
434/*
435 * Like Simple_vFAIL(), but accepts three arguments.
436 */
ccb2c380 437#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 438 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
439 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
440 (int)offset, RExC_precomp, RExC_precomp + offset); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
445 */
ccb2c380
MP
446#define vFAIL3(m,a1,a2) STMT_START { \
447 if (!SIZE_ONLY) \
448 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
449 Simple_vFAIL3(m, a1, a2); \
450} STMT_END
b45f050a
JF
451
452/*
453 * Like Simple_vFAIL(), but accepts four arguments.
454 */
ccb2c380 455#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 456 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
b45f050a 460
ccb2c380 461#define vWARN(loc,m) STMT_START { \
a28509cc 462 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
463 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
464 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
466
467#define vWARNdep(loc,m) STMT_START { \
a28509cc 468 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
469 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
470 "%s" REPORT_LOCATION, \
471 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474
475#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 476 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
478 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
480
481#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
486
487#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
491} STMT_END
492
493#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
9d1d55b5 498
8615cb43 499
cd439c50 500/* Allow for side effects in s */
ccb2c380
MP
501#define REGC(c,s) STMT_START { \
502 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
503} STMT_END
cd439c50 504
fac92740
MJD
505/* Macros for recording node offsets. 20001227 mjd@plover.com
506 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
507 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
508 * Element 0 holds the number n.
07be1b83 509 * Position is 1 indexed.
fac92740
MJD
510 */
511
ccb2c380
MP
512#define Set_Node_Offset_To_R(node,byte) STMT_START { \
513 if (! SIZE_ONLY) { \
514 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 515 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 516 if((node) < 0) { \
551405c4 517 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
518 } else { \
519 RExC_offsets[2*(node)-1] = (byte); \
520 } \
521 } \
522} STMT_END
523
524#define Set_Node_Offset(node,byte) \
525 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
526#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
527
528#define Set_Node_Length_To_R(node,len) STMT_START { \
529 if (! SIZE_ONLY) { \
530 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 531 __LINE__, (int)(node), (int)(len))); \
ccb2c380 532 if((node) < 0) { \
551405c4 533 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
534 } else { \
535 RExC_offsets[2*(node)] = (len); \
536 } \
537 } \
538} STMT_END
539
540#define Set_Node_Length(node,len) \
541 Set_Node_Length_To_R((node)-RExC_emit_start, len)
542#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
543#define Set_Node_Cur_Length(node) \
544 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
545
546/* Get offsets and lengths */
547#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
548#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
549
07be1b83
YO
550#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
552 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
553} STMT_END
554
555
556#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
557#define EXPERIMENTAL_INPLACESCAN
558#endif
559
1de06328 560#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 561DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 562 PerlIO_printf(Perl_debug_log, \
b515a41d 563 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
564 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
565 (int)(depth)*2, "", \
566 (IV)((data)->pos_min), \
567 (IV)((data)->pos_delta), \
568 (IV)((data)->flags), \
569 (IV)((data)->whilem_c), \
570 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
571 ); \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
589 ); \
590 PerlIO_printf(Perl_debug_log,"\n"); \
591});
592
acfe0abc 593static void clear_re(pTHX_ void *r);
4327152a 594
653099ff 595/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 596 Update the longest found anchored substring and the longest found
653099ff
GS
597 floating substrings if needed. */
598
4327152a 599STATIC void
1de06328 600S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 601{
e1ec3a88
AL
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 604 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 605
c277df42 606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 607 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
b81d288d 611 data->flags
c277df42
IZ
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
613 else
614 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
a0ed51b3
LW
617 }
618 else {
c277df42 619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
620 data->offset_float_max = (l
621 ? data->last_start_max
c277df42 622 : data->pos_min + data->pos_delta);
9051bda5
HS
623 if ((U32)data->offset_float_max > (U32)I32_MAX)
624 data->offset_float_max = I32_MAX;
c277df42 625 if (data->flags & SF_BEFORE_EOL)
b81d288d 626 data->flags
c277df42
IZ
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
628 else
629 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
c277df42
IZ
632 }
633 }
634 SvCUR_set(data->last_found, 0);
0eda9292 635 {
a28509cc 636 SV * const sv = data->last_found;
097eb12c
AL
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
639 if (mg)
640 mg->mg_len = 0;
641 }
0eda9292 642 }
c277df42
IZ
643 data->last_end = -1;
644 data->flags &= ~SF_BEFORE_EOL;
1de06328 645 DEBUG_STUDYDATA(data,0);
c277df42
IZ
646}
647
653099ff
GS
648/* Can match anything (initialization) */
649STATIC void
097eb12c 650S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
653099ff 652 ANYOF_CLASS_ZERO(cl);
f8bef550 653 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* Can match anything (initialization) */
660STATIC int
5f66b61c 661S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
662{
663 int value;
664
aaa51d5e 665 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
667 return 1;
1aa99e6b
IH
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
669 return 0;
10edeb5d 670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 671 return 0;
653099ff
GS
672 return 1;
673}
674
675/* Can match anything (initialization) */
676STATIC void
097eb12c 677S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 678{
8ecf7187 679 Zero(cl, 1, struct regnode_charclass_class);
653099ff 680 cl->type = ANYOF;
830247a4 681 cl_anything(pRExC_state, cl);
653099ff
GS
682}
683
684STATIC void
097eb12c 685S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 686{
8ecf7187 687 Zero(cl, 1, struct regnode_charclass_class);
653099ff 688 cl->type = ANYOF;
830247a4 689 cl_anything(pRExC_state, cl);
653099ff
GS
690 if (LOC)
691 cl->flags |= ANYOF_LOCALE;
692}
693
694/* 'And' a given class with another one. Can create false positives */
695/* We assume that cl is not inverted */
696STATIC void
5f66b61c 697S_cl_and(struct regnode_charclass_class *cl,
a28509cc 698 const struct regnode_charclass_class *and_with)
653099ff 699{
40d049e4
YO
700
701 assert(and_with->type == ANYOF);
653099ff
GS
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
707 int i;
708
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
712 else
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
1aa99e6b 718
14ebb1a2
JH
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
724 }
14ebb1a2
JH
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 727 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
731}
732
733/* 'OR' a given class with another one. Can create false positives */
734/* We assume that cl is not inverted */
735STATIC void
097eb12c 736S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 737{
653099ff
GS
738 if (or_with->flags & ANYOF_INVERT) {
739 /* We do not use
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
747 */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
751 int i;
752
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
756 else {
830247a4 757 cl_anything(pRExC_state, cl);
653099ff
GS
758 }
759 } else {
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 762 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
763 || (cl->flags & ANYOF_FOLD)) ) {
764 int i;
765
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
773 }
774 }
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 776 cl_anything(pRExC_state, cl);
653099ff
GS
777 }
778 }
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
781
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
786 }
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
653099ff
GS
791}
792
a3621e74
YO
793#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
797
3dab1dad
YO
798
799#ifdef DEBUGGING
07be1b83 800/*
3dab1dad
YO
801 dump_trie(trie)
802 dump_trie_interim_list(trie,next_alloc)
803 dump_trie_interim_table(trie,next_alloc)
804
805 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
809
3dab1dad
YO
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
07be1b83
YO
812
813*/
3dab1dad
YO
814
815/*
816 dump_trie(trie)
817 Dumps the final compressed table form of the trie to Perl_debug_log.
818 Used for debugging make_trie().
819*/
820
821STATIC void
822S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
823{
824 U32 state;
ab3bbdeb
YO
825 SV *sv=sv_newmortal();
826 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
827 GET_RE_DEBUG_FLAGS_DECL;
828
ab3bbdeb 829
3dab1dad
YO
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
833
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 835 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 836 if ( tmp ) {
ab3bbdeb
YO
837 PerlIO_printf( Perl_debug_log, "%*s",
838 colwidth,
ddc5bc0f 839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
843 )
844 );
3dab1dad
YO
845 }
846 }
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
849
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
852 PerlIO_printf( Perl_debug_log, "\n");
853
1e2e3d02 854 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 855 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
856
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
858
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
861 } else {
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
863 }
864
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
866
867 if ( base ) {
868 U32 ofs = 0;
869
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
873 ofs++;
874
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
876
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
881 {
ab3bbdeb
YO
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
883 colwidth,
3dab1dad
YO
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
885 } else {
ab3bbdeb 886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
887 }
888 }
889
890 PerlIO_printf( Perl_debug_log, "]");
891
892 }
893 PerlIO_printf( Perl_debug_log, "\n" );
894 }
895}
896/*
897 dump_trie_interim_list(trie,next_alloc)
898 Dumps a fully constructed but uncompressed trie in list form.
899 List tries normally only are used for construction when the number of
900 possible chars (trie->uniquecharcount) is very high.
901 Used for debugging make_trie().
902*/
903STATIC void
904S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
905{
906 U32 state;
ab3bbdeb
YO
907 SV *sv=sv_newmortal();
908 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
ab3bbdeb
YO
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
913 "------:-----+-----------------\n" );
3dab1dad
YO
914
915 for( state=1 ; state < next_alloc ; state ++ ) {
916 U16 charid;
917
ab3bbdeb 918 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
922 } else {
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
925 );
926 }
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 928 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
929 if ( tmp ) {
930 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
931 colwidth,
ddc5bc0f 932 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
933 PL_colors[0], PL_colors[1],
934 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
935 PERL_PV_ESCAPE_FIRSTCHAR
936 ) ,
1e2e3d02
YO
937 TRIE_LIST_ITEM(state,charid).forid,
938 (UV)TRIE_LIST_ITEM(state,charid).newstate
939 );
940 if (!(charid % 10))
664e119d
RGS
941 PerlIO_printf(Perl_debug_log, "\n%*s| ",
942 (int)((depth * 2) + 14), "");
1e2e3d02 943 }
ab3bbdeb
YO
944 }
945 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
946 }
947}
948
949/*
950 dump_trie_interim_table(trie,next_alloc)
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
955*/
956STATIC void
957S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
958{
959 U32 state;
960 U16 charid;
ab3bbdeb
YO
961 SV *sv=sv_newmortal();
962 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
963 GET_RE_DEBUG_FLAGS_DECL;
964
965 /*
966 print out the table precompression so that we can do a visual check
967 that they are identical.
968 */
969
970 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
971
972 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 973 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 974 if ( tmp ) {
ab3bbdeb
YO
975 PerlIO_printf( Perl_debug_log, "%*s",
976 colwidth,
ddc5bc0f 977 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
978 PL_colors[0], PL_colors[1],
979 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
980 PERL_PV_ESCAPE_FIRSTCHAR
981 )
982 );
3dab1dad
YO
983 }
984 }
985
986 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
987
988 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 989 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
990 }
991
992 PerlIO_printf( Perl_debug_log, "\n" );
993
994 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
995
996 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
997 (int)depth * 2 + 2,"",
998 (UV)TRIE_NODENUM( state ) );
999
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1001 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1002 if (v)
1003 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1004 else
1005 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1006 }
1007 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1008 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1009 } else {
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1011 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1012 }
1013 }
07be1b83 1014}
3dab1dad
YO
1015
1016#endif
1017
786e8c11
YO
1018/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1019 startbranch: the first branch in the whole branch sequence
1020 first : start branch of sequence of branch-exact nodes.
1021 May be the same as startbranch
1022 last : Thing following the last branch.
1023 May be the same as tail.
1024 tail : item following the branch sequence
1025 count : words in the sequence
1026 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1027 depth : indent depth
3dab1dad 1028
786e8c11 1029Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1030
786e8c11
YO
1031A trie is an N'ary tree where the branches are determined by digital
1032decomposition of the key. IE, at the root node you look up the 1st character and
1033follow that branch repeat until you find the end of the branches. Nodes can be
1034marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1035
786e8c11 1036 /he|she|his|hers/
72f13be8 1037
786e8c11
YO
1038would convert into the following structure. Numbers represent states, letters
1039following numbers represent valid transitions on the letter from that state, if
1040the number is in square brackets it represents an accepting state, otherwise it
1041will be in parenthesis.
07be1b83 1042
786e8c11
YO
1043 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1044 | |
1045 | (2)
1046 | |
1047 (1) +-i->(6)-+-s->[7]
1048 |
1049 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1050
786e8c11
YO
1051 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1052
1053This shows that when matching against the string 'hers' we will begin at state 1
1054read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1055then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1056is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1057single traverse. We store a mapping from accepting to state to which word was
1058matched, and then when we have multiple possibilities we try to complete the
1059rest of the regex in the order in which they occured in the alternation.
1060
1061The only prior NFA like behaviour that would be changed by the TRIE support is
1062the silent ignoring of duplicate alternations which are of the form:
1063
1064 / (DUPE|DUPE) X? (?{ ... }) Y /x
1065
1066Thus EVAL blocks follwing a trie may be called a different number of times with
1067and without the optimisation. With the optimisations dupes will be silently
1068ignored. This inconsistant behaviour of EVAL type nodes is well established as
1069the following demonstrates:
1070
1071 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1072
1073which prints out 'word' three times, but
1074
1075 'words'=~/(word|word|word)(?{ print $1 })S/
1076
1077which doesnt print it out at all. This is due to other optimisations kicking in.
1078
1079Example of what happens on a structural level:
1080
1081The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1082
1083 1: CURLYM[1] {1,32767}(18)
1084 5: BRANCH(8)
1085 6: EXACT <ac>(16)
1086 8: BRANCH(11)
1087 9: EXACT <ad>(16)
1088 11: BRANCH(14)
1089 12: EXACT <ab>(16)
1090 16: SUCCEED(0)
1091 17: NOTHING(18)
1092 18: END(0)
1093
1094This would be optimizable with startbranch=5, first=5, last=16, tail=16
1095and should turn into:
1096
1097 1: CURLYM[1] {1,32767}(18)
1098 5: TRIE(16)
1099 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1100 <ac>
1101 <ad>
1102 <ab>
1103 16: SUCCEED(0)
1104 17: NOTHING(18)
1105 18: END(0)
1106
1107Cases where tail != last would be like /(?foo|bar)baz/:
1108
1109 1: BRANCH(4)
1110 2: EXACT <foo>(8)
1111 4: BRANCH(7)
1112 5: EXACT <bar>(8)
1113 7: TAIL(8)
1114 8: EXACT <baz>(10)
1115 10: END(0)
1116
1117which would be optimizable with startbranch=1, first=1, last=7, tail=8
1118and would end up looking like:
1119
1120 1: TRIE(8)
1121 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1122 <foo>
1123 <bar>
1124 7: TAIL(8)
1125 8: EXACT <baz>(10)
1126 10: END(0)
1127
1128 d = uvuni_to_utf8_flags(d, uv, 0);
1129
1130is the recommended Unicode-aware way of saying
1131
1132 *(d++) = uv;
1133*/
1134
1e2e3d02 1135#define TRIE_STORE_REVCHAR \
786e8c11 1136 STMT_START { \
1e2e3d02 1137 SV *tmp = newSVpvs(""); \
786e8c11 1138 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1139 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1140 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1141 } STMT_END
1142
1143#define TRIE_READ_CHAR STMT_START { \
1144 wordlen++; \
1145 if ( UTF ) { \
1146 if ( folder ) { \
1147 if ( foldlen > 0 ) { \
1148 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1149 foldlen -= len; \
1150 scan += len; \
1151 len = 0; \
1152 } else { \
1153 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1154 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1155 foldlen -= UNISKIP( uvc ); \
1156 scan = foldbuf + UNISKIP( uvc ); \
1157 } \
1158 } else { \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 } \
1161 } else { \
1162 uvc = (U32)*uc; \
1163 len = 1; \
1164 } \
1165} STMT_END
1166
1167
1168
1169#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1170 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1171 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1172 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1173 } \
1174 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1175 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1176 TRIE_LIST_CUR( state )++; \
1177} STMT_END
07be1b83 1178
786e8c11
YO
1179#define TRIE_LIST_NEW(state) STMT_START { \
1180 Newxz( trie->states[ state ].trans.list, \
1181 4, reg_trie_trans_le ); \
1182 TRIE_LIST_CUR( state ) = 1; \
1183 TRIE_LIST_LEN( state ) = 4; \
1184} STMT_END
07be1b83 1185
786e8c11
YO
1186#define TRIE_HANDLE_WORD(state) STMT_START { \
1187 U16 dupe= trie->states[ state ].wordnum; \
1188 regnode * const noper_next = regnext( noper ); \
1189 \
1190 if (trie->wordlen) \
1191 trie->wordlen[ curword ] = wordlen; \
1192 DEBUG_r({ \
1193 /* store the word for dumping */ \
1194 SV* tmp; \
1195 if (OP(noper) != NOTHING) \
1196 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1197 else \
1198 tmp = newSVpvn( "", 0 ); \
1199 if ( UTF ) SvUTF8_on( tmp ); \
1200 av_push( trie->words, tmp ); \
1201 }); \
1202 \
1203 curword++; \
1204 \
1205 if ( noper_next < tail ) { \
1206 if (!trie->jump) \
446bd890 1207 trie->jump = PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1208 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1209 if (!jumper) \
1210 jumper = noper_next; \
1211 if (!nextbranch) \
1212 nextbranch= regnext(cur); \
1213 } \
1214 \
1215 if ( dupe ) { \
1216 /* So it's a dupe. This means we need to maintain a */\
1217 /* linked-list from the first to the next. */\
1218 /* we only allocate the nextword buffer when there */\
1219 /* a dupe, so first time we have to do the allocation */\
1220 if (!trie->nextword) \
446bd890
NC
1221 trie->nextword = \
1222 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1223 while ( trie->nextword[dupe] ) \
1224 dupe= trie->nextword[dupe]; \
1225 trie->nextword[dupe]= curword; \
1226 } else { \
1227 /* we haven't inserted this word yet. */ \
1228 trie->states[ state ].wordnum = curword; \
1229 } \
1230} STMT_END
07be1b83 1231
3dab1dad 1232
786e8c11
YO
1233#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1234 ( ( base + charid >= ucharcount \
1235 && base + charid < ubound \
1236 && state == trie->trans[ base - ucharcount + charid ].check \
1237 && trie->trans[ base - ucharcount + charid ].next ) \
1238 ? trie->trans[ base - ucharcount + charid ].next \
1239 : ( state==1 ? special : 0 ) \
1240 )
3dab1dad 1241
786e8c11
YO
1242#define MADE_TRIE 1
1243#define MADE_JUMP_TRIE 2
1244#define MADE_EXACT_TRIE 4
3dab1dad 1245
a3621e74 1246STATIC I32
786e8c11 1247S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1248{
27da23d5 1249 dVAR;
a3621e74
YO
1250 /* first pass, loop through and scan words */
1251 reg_trie_data *trie;
1252 regnode *cur;
9f7f3913 1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1254 STRLEN len = 0;
1255 UV uvc = 0;
1256 U16 curword = 0;
1257 U32 next_alloc = 0;
786e8c11
YO
1258 regnode *jumper = NULL;
1259 regnode *nextbranch = NULL;
7f69552c 1260 regnode *convert = NULL;
a3621e74 1261 /* we just use folder as a flag in utf8 */
e1ec3a88 1262 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1263 ? PL_fold
1264 : ( flags == EXACTFL
1265 ? PL_fold_locale
1266 : NULL
1267 )
1268 );
1269
e1ec3a88 1270 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1271 SV *re_trie_maxbuff;
3dab1dad
YO
1272#ifndef DEBUGGING
1273 /* these are only used during construction but are useful during
8e11feef 1274 * debugging so we store them in the struct when debugging.
8e11feef 1275 */
3dab1dad 1276 STRLEN trie_charcount=0;
3dab1dad
YO
1277 AV *trie_revcharmap;
1278#endif
a3621e74 1279 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1280#ifndef DEBUGGING
1281 PERL_UNUSED_ARG(depth);
1282#endif
a3621e74 1283
446bd890 1284 trie = PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1285 trie->refcount = 1;
3dab1dad 1286 trie->startstate = 1;
786e8c11 1287 trie->wordcount = word_count;
f8fc2ecf 1288 RExC_rxi->data->data[ data_slot ] = (void*)trie;
446bd890 1289 trie->charmap = PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1290 if (!(UTF && folder))
446bd890 1291 trie->bitmap = PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74
YO
1292 DEBUG_r({
1293 trie->words = newAV();
a3621e74 1294 });
3dab1dad 1295 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1296
0111c4fd 1297 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1298 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1299 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1300 }
3dab1dad
YO
1301 DEBUG_OPTIMISE_r({
1302 PerlIO_printf( Perl_debug_log,
786e8c11 1303 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1304 (int)depth * 2 + 2, "",
1305 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1306 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1307 (int)depth);
3dab1dad 1308 });
7f69552c
YO
1309
1310 /* Find the node we are going to overwrite */
1311 if ( first == startbranch && OP( last ) != BRANCH ) {
1312 /* whole branch chain */
1313 convert = first;
1314 } else {
1315 /* branch sub-chain */
1316 convert = NEXTOPER( first );
1317 }
1318
a3621e74
YO
1319 /* -- First loop and Setup --
1320
1321 We first traverse the branches and scan each word to determine if it
1322 contains widechars, and how many unique chars there are, this is
1323 important as we have to build a table with at least as many columns as we
1324 have unique chars.
1325
1326 We use an array of integers to represent the character codes 0..255
1327 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1328 native representation of the character value as the key and IV's for the
1329 coded index.
1330
1331 *TODO* If we keep track of how many times each character is used we can
1332 remap the columns so that the table compression later on is more
1333 efficient in terms of memory by ensuring most common value is in the
1334 middle and the least common are on the outside. IMO this would be better
1335 than a most to least common mapping as theres a decent chance the most
1336 common letter will share a node with the least common, meaning the node
1337 will not be compressable. With a middle is most common approach the worst
1338 case is when we have the least common nodes twice.
1339
1340 */
1341
a3621e74 1342 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1343 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1344 const U8 *uc = (U8*)STRING( noper );
a28509cc 1345 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1346 STRLEN foldlen = 0;
1347 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1348 const U8 *scan = (U8*)NULL;
07be1b83 1349 U32 wordlen = 0; /* required init */
3dab1dad 1350 STRLEN chars=0;
a3621e74 1351
3dab1dad
YO
1352 if (OP(noper) == NOTHING) {
1353 trie->minlen= 0;
1354 continue;
1355 }
1356 if (trie->bitmap) {
1357 TRIE_BITMAP_SET(trie,*uc);
1358 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1359 }
a3621e74 1360 for ( ; uc < e ; uc += len ) {
3dab1dad 1361 TRIE_CHARCOUNT(trie)++;
a3621e74 1362 TRIE_READ_CHAR;
3dab1dad 1363 chars++;
a3621e74
YO
1364 if ( uvc < 256 ) {
1365 if ( !trie->charmap[ uvc ] ) {
1366 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1367 if ( folder )
1368 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1369 TRIE_STORE_REVCHAR;
a3621e74
YO
1370 }
1371 } else {
1372 SV** svpp;
1373 if ( !trie->widecharmap )
1374 trie->widecharmap = newHV();
1375
1376 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1377
1378 if ( !svpp )
e4584336 1379 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1380
1381 if ( !SvTRUE( *svpp ) ) {
1382 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1383 TRIE_STORE_REVCHAR;
a3621e74
YO
1384 }
1385 }
1386 }
3dab1dad
YO
1387 if( cur == first ) {
1388 trie->minlen=chars;
1389 trie->maxlen=chars;
1390 } else if (chars < trie->minlen) {
1391 trie->minlen=chars;
1392 } else if (chars > trie->maxlen) {
1393 trie->maxlen=chars;
1394 }
1395
a3621e74
YO
1396 } /* end first pass */
1397 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1398 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1399 (int)depth * 2 + 2,"",
85c3142d 1400 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1401 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1402 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1403 );
446bd890 1404 trie->wordlen = PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1405
1406 /*
1407 We now know what we are dealing with in terms of unique chars and
1408 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1409 representation using a flat table will take. If it's over a reasonable
1410 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1411 conservative but potentially much slower representation using an array
1412 of lists.
1413
1414 At the end we convert both representations into the same compressed
1415 form that will be used in regexec.c for matching with. The latter
1416 is a form that cannot be used to construct with but has memory
1417 properties similar to the list form and access properties similar
1418 to the table form making it both suitable for fast searches and
1419 small enough that its feasable to store for the duration of a program.
1420
1421 See the comment in the code where the compressed table is produced
1422 inplace from the flat tabe representation for an explanation of how
1423 the compression works.
1424
1425 */
1426
1427
3dab1dad 1428 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1429 /*
1430 Second Pass -- Array Of Lists Representation
1431
1432 Each state will be represented by a list of charid:state records
1433 (reg_trie_trans_le) the first such element holds the CUR and LEN
1434 points of the allocated array. (See defines above).
1435
1436 We build the initial structure using the lists, and then convert
1437 it into the compressed table form which allows faster lookups
1438 (but cant be modified once converted).
a3621e74
YO
1439 */
1440
a3621e74
YO
1441 STRLEN transcount = 1;
1442
1e2e3d02
YO
1443 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1444 "%*sCompiling trie using list compiler\n",
1445 (int)depth * 2 + 2, ""));
446bd890
NC
1446
1447 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1448 sizeof(reg_trie_state) );
a3621e74
YO
1449 TRIE_LIST_NEW(1);
1450 next_alloc = 2;
1451
1452 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1453
c445ea15
AL
1454 regnode * const noper = NEXTOPER( cur );
1455 U8 *uc = (U8*)STRING( noper );
1456 const U8 * const e = uc + STR_LEN( noper );
1457 U32 state = 1; /* required init */
1458 U16 charid = 0; /* sanity init */
1459 U8 *scan = (U8*)NULL; /* sanity init */
1460 STRLEN foldlen = 0; /* required init */
07be1b83 1461 U32 wordlen = 0; /* required init */
c445ea15
AL
1462 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1463
3dab1dad 1464 if (OP(noper) != NOTHING) {
786e8c11 1465 for ( ; uc < e ; uc += len ) {
c445ea15 1466
786e8c11 1467 TRIE_READ_CHAR;
c445ea15 1468
786e8c11
YO
1469 if ( uvc < 256 ) {
1470 charid = trie->charmap[ uvc ];
c445ea15 1471 } else {
786e8c11
YO
1472 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1473 if ( !svpp ) {
1474 charid = 0;
1475 } else {
1476 charid=(U16)SvIV( *svpp );
1477 }
c445ea15 1478 }
786e8c11
YO
1479 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1480 if ( charid ) {
a3621e74 1481
786e8c11
YO
1482 U16 check;
1483 U32 newstate = 0;
a3621e74 1484
786e8c11
YO
1485 charid--;
1486 if ( !trie->states[ state ].trans.list ) {
1487 TRIE_LIST_NEW( state );
c445ea15 1488 }
786e8c11
YO
1489 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1490 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1491 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1492 break;
1493 }
1494 }
1495 if ( ! newstate ) {
1496 newstate = next_alloc++;
1497 TRIE_LIST_PUSH( state, charid, newstate );
1498 transcount++;
1499 }
1500 state = newstate;
1501 } else {
1502 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1503 }
a28509cc 1504 }
c445ea15 1505 }
3dab1dad 1506 TRIE_HANDLE_WORD(state);
a3621e74
YO
1507
1508 } /* end second pass */
1509
1e2e3d02
YO
1510 /* next alloc is the NEXT state to be allocated */
1511 trie->statecount = next_alloc;
446bd890
NC
1512 trie->states = PerlMemShared_realloc( trie->states, next_alloc
1513 * sizeof(reg_trie_state) );
a3621e74 1514
3dab1dad
YO
1515 /* and now dump it out before we compress it */
1516 DEBUG_TRIE_COMPILE_MORE_r(
1517 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1518 );
a3621e74 1519
446bd890
NC
1520 trie->trans
1521 = PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1522 {
1523 U32 state;
a3621e74
YO
1524 U32 tp = 0;
1525 U32 zp = 0;
1526
1527
1528 for( state=1 ; state < next_alloc ; state ++ ) {
1529 U32 base=0;
1530
1531 /*
1532 DEBUG_TRIE_COMPILE_MORE_r(
1533 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1534 );
1535 */
1536
1537 if (trie->states[state].trans.list) {
1538 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1539 U16 maxid=minid;
a28509cc 1540 U16 idx;
a3621e74
YO
1541
1542 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1543 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1544 if ( forid < minid ) {
1545 minid=forid;
1546 } else if ( forid > maxid ) {
1547 maxid=forid;
1548 }
a3621e74
YO
1549 }
1550 if ( transcount < tp + maxid - minid + 1) {
1551 transcount *= 2;
446bd890
NC
1552 trie->trans
1553 = PerlMemShared_realloc( trie->trans,
1554 transcount
1555 * sizeof(reg_trie_trans) );
a3621e74
YO
1556 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1557 }
1558 base = trie->uniquecharcount + tp - minid;
1559 if ( maxid == minid ) {
1560 U32 set = 0;
1561 for ( ; zp < tp ; zp++ ) {
1562 if ( ! trie->trans[ zp ].next ) {
1563 base = trie->uniquecharcount + zp - minid;
1564 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1565 trie->trans[ zp ].check = state;
1566 set = 1;
1567 break;
1568 }
1569 }
1570 if ( !set ) {
1571 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1572 trie->trans[ tp ].check = state;
1573 tp++;
1574 zp = tp;
1575 }
1576 } else {
1577 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1578 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1579 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1580 trie->trans[ tid ].check = state;
1581 }
1582 tp += ( maxid - minid + 1 );
1583 }
1584 Safefree(trie->states[ state ].trans.list);
1585 }
1586 /*
1587 DEBUG_TRIE_COMPILE_MORE_r(
1588 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1589 );
1590 */
1591 trie->states[ state ].trans.base=base;
1592 }
cc601c31 1593 trie->lasttrans = tp + 1;
a3621e74
YO
1594 }
1595 } else {
1596 /*
1597 Second Pass -- Flat Table Representation.
1598
1599 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1600 We know that we will need Charcount+1 trans at most to store the data
1601 (one row per char at worst case) So we preallocate both structures
1602 assuming worst case.
1603
1604 We then construct the trie using only the .next slots of the entry
1605 structs.
1606
1607 We use the .check field of the first entry of the node temporarily to
1608 make compression both faster and easier by keeping track of how many non
1609 zero fields are in the node.
1610
1611 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1612 transition.
1613
1614 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1615 number representing the first entry of the node, and state as a
1616 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1617 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1618 are 2 entrys per node. eg:
1619
1620 A B A B
1621 1. 2 4 1. 3 7
1622 2. 0 3 3. 0 5
1623 3. 0 0 5. 0 0
1624 4. 0 0 7. 0 0
1625
1626 The table is internally in the right hand, idx form. However as we also
1627 have to deal with the states array which is indexed by nodenum we have to
1628 use TRIE_NODENUM() to convert.
1629
1630 */
1e2e3d02
YO
1631 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1632 "%*sCompiling trie using table compiler\n",
1633 (int)depth * 2 + 2, ""));
3dab1dad 1634
446bd890
NC
1635 trie->trans = PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1636 * trie->uniquecharcount + 1,
1637 sizeof(reg_trie_trans) );
1638 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1639 sizeof(reg_trie_state) );
a3621e74
YO
1640 next_alloc = trie->uniquecharcount + 1;
1641
3dab1dad 1642
a3621e74
YO
1643 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1644
c445ea15 1645 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1646 const U8 *uc = (U8*)STRING( noper );
1647 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1648
1649 U32 state = 1; /* required init */
1650
1651 U16 charid = 0; /* sanity init */
1652 U32 accept_state = 0; /* sanity init */
1653 U8 *scan = (U8*)NULL; /* sanity init */
1654
1655 STRLEN foldlen = 0; /* required init */
07be1b83 1656 U32 wordlen = 0; /* required init */
a3621e74
YO
1657 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1658
3dab1dad 1659 if ( OP(noper) != NOTHING ) {
786e8c11 1660 for ( ; uc < e ; uc += len ) {
a3621e74 1661
786e8c11 1662 TRIE_READ_CHAR;
a3621e74 1663
786e8c11
YO
1664 if ( uvc < 256 ) {
1665 charid = trie->charmap[ uvc ];
1666 } else {
1667 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1668 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1669 }
786e8c11
YO
1670 if ( charid ) {
1671 charid--;
1672 if ( !trie->trans[ state + charid ].next ) {
1673 trie->trans[ state + charid ].next = next_alloc;
1674 trie->trans[ state ].check++;
1675 next_alloc += trie->uniquecharcount;
1676 }
1677 state = trie->trans[ state + charid ].next;
1678 } else {
1679 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1680 }
1681 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1682 }
a3621e74 1683 }
3dab1dad
YO
1684 accept_state = TRIE_NODENUM( state );
1685 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1686
1687 } /* end second pass */
1688
3dab1dad
YO
1689 /* and now dump it out before we compress it */
1690 DEBUG_TRIE_COMPILE_MORE_r(
1691 dump_trie_interim_table(trie,next_alloc,depth+1)
1692 );
a3621e74 1693
a3621e74
YO
1694 {
1695 /*
1696 * Inplace compress the table.*
1697
1698 For sparse data sets the table constructed by the trie algorithm will
1699 be mostly 0/FAIL transitions or to put it another way mostly empty.
1700 (Note that leaf nodes will not contain any transitions.)
1701
1702 This algorithm compresses the tables by eliminating most such
1703 transitions, at the cost of a modest bit of extra work during lookup:
1704
1705 - Each states[] entry contains a .base field which indicates the
1706 index in the state[] array wheres its transition data is stored.
1707
1708 - If .base is 0 there are no valid transitions from that node.
1709
1710 - If .base is nonzero then charid is added to it to find an entry in
1711 the trans array.
1712
1713 -If trans[states[state].base+charid].check!=state then the
1714 transition is taken to be a 0/Fail transition. Thus if there are fail
1715 transitions at the front of the node then the .base offset will point
1716 somewhere inside the previous nodes data (or maybe even into a node
1717 even earlier), but the .check field determines if the transition is
1718 valid.
1719
786e8c11 1720 XXX - wrong maybe?
a3621e74
YO
1721 The following process inplace converts the table to the compressed
1722 table: We first do not compress the root node 1,and mark its all its
1723 .check pointers as 1 and set its .base pointer as 1 as well. This
1724 allows to do a DFA construction from the compressed table later, and
1725 ensures that any .base pointers we calculate later are greater than
1726 0.
1727
1728 - We set 'pos' to indicate the first entry of the second node.
1729
1730 - We then iterate over the columns of the node, finding the first and
1731 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1732 and set the .check pointers accordingly, and advance pos
1733 appropriately and repreat for the next node. Note that when we copy
1734 the next pointers we have to convert them from the original
1735 NODEIDX form to NODENUM form as the former is not valid post
1736 compression.
1737
1738 - If a node has no transitions used we mark its base as 0 and do not
1739 advance the pos pointer.
1740
1741 - If a node only has one transition we use a second pointer into the
1742 structure to fill in allocated fail transitions from other states.
1743 This pointer is independent of the main pointer and scans forward
1744 looking for null transitions that are allocated to a state. When it
1745 finds one it writes the single transition into the "hole". If the
786e8c11 1746 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1747
1748 - Once compressed we can Renew/realloc the structures to release the
1749 excess space.
1750
1751 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1752 specifically Fig 3.47 and the associated pseudocode.
1753
1754 demq
1755 */
a3b680e6 1756 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1757 U32 state, charid;
a3621e74 1758 U32 pos = 0, zp=0;
1e2e3d02 1759 trie->statecount = laststate;
a3621e74
YO
1760
1761 for ( state = 1 ; state < laststate ; state++ ) {
1762 U8 flag = 0;
a28509cc
AL
1763 const U32 stateidx = TRIE_NODEIDX( state );
1764 const U32 o_used = trie->trans[ stateidx ].check;
1765 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1766 trie->trans[ stateidx ].check = 0;
1767
1768 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1769 if ( flag || trie->trans[ stateidx + charid ].next ) {
1770 if ( trie->trans[ stateidx + charid ].next ) {
1771 if (o_used == 1) {
1772 for ( ; zp < pos ; zp++ ) {
1773 if ( ! trie->trans[ zp ].next ) {
1774 break;
1775 }
1776 }
1777 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1778 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1779 trie->trans[ zp ].check = state;
1780 if ( ++zp > pos ) pos = zp;
1781 break;
1782 }
1783 used--;
1784 }
1785 if ( !flag ) {
1786 flag = 1;
1787 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1788 }
1789 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1790 trie->trans[ pos ].check = state;
1791 pos++;
1792 }
1793 }
1794 }
cc601c31 1795 trie->lasttrans = pos + 1;
446bd890
NC
1796 trie->states = PerlMemShared_realloc( trie->states, laststate
1797 * sizeof(reg_trie_state) );
a3621e74 1798 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1799 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1800 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1801 (int)depth * 2 + 2,"",
1802 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1803 (IV)next_alloc,
1804 (IV)pos,
a3621e74
YO
1805 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1806 );
1807
1808 } /* end table compress */
1809 }
1e2e3d02
YO
1810 DEBUG_TRIE_COMPILE_MORE_r(
1811 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1812 (int)depth * 2 + 2, "",
1813 (UV)trie->statecount,
1814 (UV)trie->lasttrans)
1815 );
cc601c31 1816 /* resize the trans array to remove unused space */
446bd890
NC
1817 trie->trans = PerlMemShared_realloc( trie->trans, trie->lasttrans
1818 * sizeof(reg_trie_trans) );
a3621e74 1819
3dab1dad
YO
1820 /* and now dump out the compressed format */
1821 DEBUG_TRIE_COMPILE_r(
1822 dump_trie(trie,depth+1)
1823 );
07be1b83 1824
3dab1dad 1825 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1826 U8 nodetype =(U8)(flags & 0xFF);
1827 char *str=NULL;
786e8c11 1828
07be1b83 1829#ifdef DEBUGGING
e62cc96a 1830 regnode *optimize = NULL;
b57a0404
JH
1831 U32 mjd_offset = 0;
1832 U32 mjd_nodelen = 0;
07be1b83 1833#endif
a3621e74 1834 /*
3dab1dad
YO
1835 This means we convert either the first branch or the first Exact,
1836 depending on whether the thing following (in 'last') is a branch
1837 or not and whther first is the startbranch (ie is it a sub part of
1838 the alternation or is it the whole thing.)
1839 Assuming its a sub part we conver the EXACT otherwise we convert
1840 the whole branch sequence, including the first.
a3621e74 1841 */
3dab1dad 1842 /* Find the node we are going to overwrite */
7f69552c 1843 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1844 /* branch sub-chain */
3dab1dad 1845 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1846 DEBUG_r({
1847 mjd_offset= Node_Offset((convert));
1848 mjd_nodelen= Node_Length((convert));
1849 });
7f69552c
YO
1850 /* whole branch chain */
1851 } else {
1852 DEBUG_r({
1853 const regnode *nop = NEXTOPER( convert );
1854 mjd_offset= Node_Offset((nop));
1855 mjd_nodelen= Node_Length((nop));
1856 });
07be1b83 1857 }
7f69552c 1858
07be1b83
YO
1859 DEBUG_OPTIMISE_r(
1860 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1861 (int)depth * 2 + 2, "",
786e8c11 1862 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1863 );
a3621e74 1864
3dab1dad
YO
1865 /* But first we check to see if there is a common prefix we can
1866 split out as an EXACT and put in front of the TRIE node. */
1867 trie->startstate= 1;
786e8c11 1868 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1869 U32 state;
1e2e3d02 1870 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1871 U32 ofs = 0;
8e11feef
RGS
1872 I32 idx = -1;
1873 U32 count = 0;
1874 const U32 base = trie->states[ state ].trans.base;
a3621e74 1875
3dab1dad 1876 if ( trie->states[state].wordnum )
8e11feef 1877 count = 1;
a3621e74 1878
8e11feef 1879 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1880 if ( ( base + ofs >= trie->uniquecharcount ) &&
1881 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1882 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1883 {
3dab1dad 1884 if ( ++count > 1 ) {
8e11feef 1885 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1886 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1887 if ( state == 1 ) break;
3dab1dad
YO
1888 if ( count == 2 ) {
1889 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1890 DEBUG_OPTIMISE_r(
8e11feef
RGS
1891 PerlIO_printf(Perl_debug_log,
1892 "%*sNew Start State=%"UVuf" Class: [",
1893 (int)depth * 2 + 2, "",
786e8c11 1894 (UV)state));
be8e71aa
YO
1895 if (idx >= 0) {
1896 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1897 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1898
3dab1dad 1899 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1900 if ( folder )
1901 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1902 DEBUG_OPTIMISE_r(
07be1b83 1903 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1904 );
8e11feef
RGS
1905 }
1906 }
1907 TRIE_BITMAP_SET(trie,*ch);
1908 if ( folder )
1909 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1910 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1911 }
1912 idx = ofs;
1913 }
3dab1dad
YO
1914 }
1915 if ( count == 1 ) {
1916 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
de734bd5
A
1917 char *ch = SvPV_nolen( *tmp );
1918 DEBUG_OPTIMISE_r({
1919 SV *sv=sv_newmortal();
8e11feef
RGS
1920 PerlIO_printf( Perl_debug_log,
1921 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1922 (int)depth * 2 + 2, "",
de734bd5
A
1923 (UV)state, (UV)idx,
1924 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1925 PL_colors[0], PL_colors[1],
1926 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1927 PERL_PV_ESCAPE_FIRSTCHAR
1928 )
1929 );
1930 });
3dab1dad
YO
1931 if ( state==1 ) {
1932 OP( convert ) = nodetype;
1933 str=STRING(convert);
1934 STR_LEN(convert)=0;
1935 }
de734bd5
A
1936 while (*ch) {
1937 *str++ = *ch++;
1938 STR_LEN(convert)++;
1939 }
1940
8e11feef 1941 } else {
f9049ba1 1942#ifdef DEBUGGING
8e11feef
RGS
1943 if (state>1)
1944 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1945#endif
8e11feef
RGS
1946 break;
1947 }
1948 }
3dab1dad 1949 if (str) {
8e11feef 1950 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1951 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1952 trie->startstate = state;
07be1b83
YO
1953 trie->minlen -= (state - 1);
1954 trie->maxlen -= (state - 1);
1955 DEBUG_r({
1956 regnode *fix = convert;
de734bd5 1957 U32 word = trie->wordcount;
07be1b83
YO
1958 mjd_nodelen++;
1959 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1960 while( ++fix < n ) {
1961 Set_Node_Offset_Length(fix, 0, 0);
1962 }
de734bd5
A
1963 while (word--) {
1964 SV ** const tmp = av_fetch( trie->words, word, 0 );
1965 if (tmp) {
1966 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1967 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1968 else
1969 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1970 }
1971 }
07be1b83 1972 });
8e11feef
RGS
1973 if (trie->maxlen) {
1974 convert = n;
1975 } else {
3dab1dad 1976 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1977 DEBUG_r(optimize= n);
3dab1dad
YO
1978 }
1979 }
1980 }
a5ca303d
YO
1981 if (!jumper)
1982 jumper = last;
3dab1dad 1983 if ( trie->maxlen ) {
8e11feef
RGS
1984 NEXT_OFF( convert ) = (U16)(tail - convert);
1985 ARG_SET( convert, data_slot );
786e8c11
YO
1986 /* Store the offset to the first unabsorbed branch in
1987 jump[0], which is otherwise unused by the jump logic.
1988 We use this when dumping a trie and during optimisation. */
1989 if (trie->jump)
7f69552c 1990 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1991
786e8c11
YO
1992 /* XXXX */
1993 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1994 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1995 {
1996 OP( convert ) = TRIEC;
1997 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 1998 PerlMemShared_free(trie->bitmap);
786e8c11
YO
1999 trie->bitmap= NULL;
2000 } else
2001 OP( convert ) = TRIE;
a3621e74 2002
3dab1dad
YO
2003 /* store the type in the flags */
2004 convert->flags = nodetype;
a5ca303d
YO
2005 DEBUG_r({
2006 optimize = convert
2007 + NODE_STEP_REGNODE
2008 + regarglen[ OP( convert ) ];
2009 });
2010 /* XXX We really should free up the resource in trie now,
2011 as we won't use them - (which resources?) dmq */
3dab1dad 2012 }
a3621e74 2013 /* needed for dumping*/
e62cc96a 2014 DEBUG_r(if (optimize) {
07be1b83 2015 regnode *opt = convert;
e62cc96a 2016 while ( ++opt < optimize) {
07be1b83
YO
2017 Set_Node_Offset_Length(opt,0,0);
2018 }
786e8c11
YO
2019 /*
2020 Try to clean up some of the debris left after the
2021 optimisation.
a3621e74 2022 */
786e8c11 2023 while( optimize < jumper ) {
07be1b83 2024 mjd_nodelen += Node_Length((optimize));
a3621e74 2025 OP( optimize ) = OPTIMIZED;
07be1b83 2026 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2027 optimize++;
2028 }
07be1b83 2029 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2030 });
2031 } /* end node insert */
07be1b83 2032#ifndef DEBUGGING
6e8b4190 2033 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 2034#endif
786e8c11
YO
2035 return trie->jump
2036 ? MADE_JUMP_TRIE
2037 : trie->startstate>1
2038 ? MADE_EXACT_TRIE
2039 : MADE_TRIE;
2040}
2041
2042STATIC void
2043S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2044{
2045/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2046
2047 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2048 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2049 ISBN 0-201-10088-6
2050
2051 We find the fail state for each state in the trie, this state is the longest proper
2052 suffix of the current states 'word' that is also a proper prefix of another word in our
2053 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2054 the DFA not to have to restart after its tried and failed a word at a given point, it
2055 simply continues as though it had been matching the other word in the first place.
2056 Consider
2057 'abcdgu'=~/abcdefg|cdgu/
2058 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2059 fail, which would bring use to the state representing 'd' in the second word where we would
2060 try 'g' and succeed, prodceding to match 'cdgu'.
2061 */
2062 /* add a fail transition */
f8fc2ecf 2063 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[ARG(source)];
786e8c11
YO
2064 U32 *q;
2065 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2066 const U32 numstates = trie->statecount;
786e8c11
YO
2067 const U32 ubound = trie->lasttrans + ucharcount;
2068 U32 q_read = 0;
2069 U32 q_write = 0;
2070 U32 charid;
2071 U32 base = trie->states[ 1 ].trans.base;
2072 U32 *fail;
2073 reg_ac_data *aho;
2074 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2075 GET_RE_DEBUG_FLAGS_DECL;
2076#ifndef DEBUGGING
2077 PERL_UNUSED_ARG(depth);
2078#endif
2079
2080
2081 ARG_SET( stclass, data_slot );
446bd890 2082 aho = PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2083 RExC_rxi->data->data[ data_slot ] = (void*)aho;
786e8c11 2084 aho->trie=trie;
446bd890
NC
2085 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2086 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2087 Newxz( q, numstates, U32);
446bd890 2088 aho->fail = PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2089 aho->refcount = 1;
2090 fail = aho->fail;
2091 /* initialize fail[0..1] to be 1 so that we always have
2092 a valid final fail state */
2093 fail[ 0 ] = fail[ 1 ] = 1;
2094
2095 for ( charid = 0; charid < ucharcount ; charid++ ) {
2096 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2097 if ( newstate ) {
2098 q[ q_write ] = newstate;
2099 /* set to point at the root */
2100 fail[ q[ q_write++ ] ]=1;
2101 }
2102 }
2103 while ( q_read < q_write) {
2104 const U32 cur = q[ q_read++ % numstates ];
2105 base = trie->states[ cur ].trans.base;
2106
2107 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2108 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2109 if (ch_state) {
2110 U32 fail_state = cur;
2111 U32 fail_base;
2112 do {
2113 fail_state = fail[ fail_state ];
2114 fail_base = aho->states[ fail_state ].trans.base;
2115 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2116
2117 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2118 fail[ ch_state ] = fail_state;
2119 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2120 {
2121 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2122 }
2123 q[ q_write++ % numstates] = ch_state;
2124 }
2125 }
2126 }
2127 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2128 when we fail in state 1, this allows us to use the
2129 charclass scan to find a valid start char. This is based on the principle
2130 that theres a good chance the string being searched contains lots of stuff
2131 that cant be a start char.
2132 */
2133 fail[ 0 ] = fail[ 1 ] = 0;
2134 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2135 PerlIO_printf(Perl_debug_log,
2136 "%*sStclass Failtable (%"UVuf" states): 0",
2137 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2138 );
786e8c11
YO
2139 for( q_read=1; q_read<numstates; q_read++ ) {
2140 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2141 }
2142 PerlIO_printf(Perl_debug_log, "\n");
2143 });
2144 Safefree(q);
2145 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2146}
2147
786e8c11 2148
a3621e74 2149/*
5d1c421c
JH
2150 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2151 * These need to be revisited when a newer toolchain becomes available.
2152 */
2153#if defined(__sparc64__) && defined(__GNUC__)
2154# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2155# undef SPARC64_GCC_WORKAROUND
2156# define SPARC64_GCC_WORKAROUND 1
2157# endif
2158#endif
2159
07be1b83 2160#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2161 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2162 SV * const mysv=sv_newmortal(); \
2163 regnode *Next = regnext(scan); \
2164 regprop(RExC_rx, mysv, scan); \
7f69552c 2165 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2166 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2167 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2168 }});
07be1b83 2169
1de06328
YO
2170
2171
2172
2173
07be1b83
YO
2174#define JOIN_EXACT(scan,min,flags) \
2175 if (PL_regkind[OP(scan)] == EXACT) \
2176 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2177
be8e71aa 2178STATIC U32
07be1b83
YO
2179S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2180 /* Merge several consecutive EXACTish nodes into one. */
2181 regnode *n = regnext(scan);
2182 U32 stringok = 1;
2183 regnode *next = scan + NODE_SZ_STR(scan);
2184 U32 merged = 0;
2185 U32 stopnow = 0;
2186#ifdef DEBUGGING
2187 regnode *stop = scan;
72f13be8 2188 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2189#else
d47053eb
RGS
2190 PERL_UNUSED_ARG(depth);
2191#endif
2192#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2193 PERL_UNUSED_ARG(flags);
2194 PERL_UNUSED_ARG(val);
07be1b83 2195#endif
07be1b83
YO
2196 DEBUG_PEEP("join",scan,depth);
2197
2198 /* Skip NOTHING, merge EXACT*. */
2199 while (n &&
2200 ( PL_regkind[OP(n)] == NOTHING ||
2201 (stringok && (OP(n) == OP(scan))))
2202 && NEXT_OFF(n)
2203 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2204
2205 if (OP(n) == TAIL || n > next)
2206 stringok = 0;
2207 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2208 DEBUG_PEEP("skip:",n,depth);
2209 NEXT_OFF(scan) += NEXT_OFF(n);
2210 next = n + NODE_STEP_REGNODE;
2211#ifdef DEBUGGING
2212 if (stringok)
2213 stop = n;
2214#endif
2215 n = regnext(n);
2216 }
2217 else if (stringok) {
786e8c11 2218 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2219 regnode * const nnext = regnext(n);
2220
2221 DEBUG_PEEP("merg",n,depth);
2222
2223 merged++;
2224 if (oldl + STR_LEN(n) > U8_MAX)
2225 break;
2226 NEXT_OFF(scan) += NEXT_OFF(n);
2227 STR_LEN(scan) += STR_LEN(n);
2228 next = n + NODE_SZ_STR(n);
2229 /* Now we can overwrite *n : */
2230 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2231#ifdef DEBUGGING
2232 stop = next - 1;
2233#endif
2234 n = nnext;
2235 if (stopnow) break;
2236 }
2237
d47053eb
RGS
2238#ifdef EXPERIMENTAL_INPLACESCAN
2239 if (flags && !NEXT_OFF(n)) {
2240 DEBUG_PEEP("atch", val, depth);
2241 if (reg_off_by_arg[OP(n)]) {
2242 ARG_SET(n, val - n);
2243 }
2244 else {
2245 NEXT_OFF(n) = val - n;
2246 }
2247 stopnow = 1;
2248 }
07be1b83
YO
2249#endif
2250 }
2251
2252 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2253 /*
2254 Two problematic code points in Unicode casefolding of EXACT nodes:
2255
2256 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2257 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2258
2259 which casefold to
2260
2261 Unicode UTF-8
2262
2263 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2264 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2265
2266 This means that in case-insensitive matching (or "loose matching",
2267 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2268 length of the above casefolded versions) can match a target string
2269 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2270 This would rather mess up the minimum length computation.
2271
2272 What we'll do is to look for the tail four bytes, and then peek
2273 at the preceding two bytes to see whether we need to decrease
2274 the minimum length by four (six minus two).
2275
2276 Thanks to the design of UTF-8, there cannot be false matches:
2277 A sequence of valid UTF-8 bytes cannot be a subsequence of
2278 another valid sequence of UTF-8 bytes.
2279
2280 */
2281 char * const s0 = STRING(scan), *s, *t;
2282 char * const s1 = s0 + STR_LEN(scan) - 1;
2283 char * const s2 = s1 - 4;
e294cc5d
JH
2284#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2285 const char t0[] = "\xaf\x49\xaf\x42";
2286#else
07be1b83 2287 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2288#endif
07be1b83
YO
2289 const char * const t1 = t0 + 3;
2290
2291 for (s = s0 + 2;
2292 s < s2 && (t = ninstr(s, s1, t0, t1));
2293 s = t + 4) {
e294cc5d
JH
2294#ifdef EBCDIC
2295 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2296 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2297#else
07be1b83
YO
2298 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2299 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2300#endif
07be1b83
YO
2301 *min -= 4;
2302 }
2303 }
2304
2305#ifdef DEBUGGING
2306 /* Allow dumping */
2307 n = scan + NODE_SZ_STR(scan);
2308 while (n <= stop) {
2309 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2310 OP(n) = OPTIMIZED;
2311 NEXT_OFF(n) = 0;
2312 }
2313 n++;
2314 }
2315#endif
2316 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2317 return stopnow;
2318}
2319
653099ff
GS
2320/* REx optimizer. Converts nodes into quickier variants "in place".
2321 Finds fixed substrings. */
2322
a0288114 2323/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2324 to the position after last scanned or to NULL. */
2325
40d049e4
YO
2326#define INIT_AND_WITHP \
2327 assert(!and_withp); \
2328 Newx(and_withp,1,struct regnode_charclass_class); \
2329 SAVEFREEPV(and_withp)
07be1b83 2330
b515a41d
YO
2331/* this is a chain of data about sub patterns we are processing that
2332 need to be handled seperately/specially in study_chunk. Its so
2333 we can simulate recursion without losing state. */
2334struct scan_frame;
2335typedef struct scan_frame {
2336 regnode *last; /* last node to process in this frame */
2337 regnode *next; /* next node to process when last is reached */
2338 struct scan_frame *prev; /*previous frame*/
2339 I32 stop; /* what stopparen do we use */
2340} scan_frame;
2341
76e3520e 2342STATIC I32
40d049e4 2343S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2344 I32 *minlenp, I32 *deltap,
40d049e4
YO
2345 regnode *last,
2346 scan_data_t *data,
2347 I32 stopparen,
2348 U8* recursed,
2349 struct regnode_charclass_class *and_withp,
2350 U32 flags, U32 depth)
c277df42
IZ
2351 /* scanp: Start here (read-write). */
2352 /* deltap: Write maxlen-minlen here. */
2353 /* last: Stop before this one. */
40d049e4
YO
2354 /* data: string data about the pattern */
2355 /* stopparen: treat close N as END */
2356 /* recursed: which subroutines have we recursed into */
2357 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2358{
97aff369 2359 dVAR;
c277df42
IZ
2360 I32 min = 0, pars = 0, code;
2361 regnode *scan = *scanp, *next;
2362 I32 delta = 0;
2363 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2364 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2365 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2366 scan_data_t data_fake;
a3621e74 2367 SV *re_trie_maxbuff = NULL;
786e8c11 2368 regnode *first_non_open = scan;
e2e6a0f1 2369 I32 stopmin = I32_MAX;
8aa23a47
YO
2370 scan_frame *frame = NULL;
2371
a3621e74 2372 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2373
13a24bad 2374#ifdef DEBUGGING
40d049e4 2375 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2376#endif
40d049e4 2377
786e8c11 2378 if ( depth == 0 ) {
40d049e4 2379 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2380 first_non_open=regnext(first_non_open);
2381 }
2382
b81d288d 2383
8aa23a47
YO
2384 fake_study_recurse:
2385 while ( scan && OP(scan) != END && scan < last ){
2386 /* Peephole optimizer: */
2387 DEBUG_STUDYDATA(data,depth);
2388 DEBUG_PEEP("Peep",scan,depth);
2389 JOIN_EXACT(scan,&min,0);
2390
2391 /* Follow the next-chain of the current node and optimize
2392 away all the NOTHINGs from it. */
2393 if (OP(scan) != CURLYX) {
2394 const int max = (reg_off_by_arg[OP(scan)]
2395 ? I32_MAX
2396 /* I32 may be smaller than U16 on CRAYs! */
2397 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2398 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2399 int noff;
2400 regnode *n = scan;
2401
2402 /* Skip NOTHING and LONGJMP. */
2403 while ((n = regnext(n))
2404 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2405 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2406 && off + noff < max)
2407 off += noff;
2408 if (reg_off_by_arg[OP(scan)])
2409 ARG(scan) = off;
2410 else
2411 NEXT_OFF(scan) = off;
2412 }
a3621e74 2413
c277df42 2414
8aa23a47
YO
2415
2416 /* The principal pseudo-switch. Cannot be a switch, since we
2417 look into several different things. */
2418 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2419 || OP(scan) == IFTHEN) {
2420 next = regnext(scan);
2421 code = OP(scan);
2422 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2423
2424 if (OP(next) == code || code == IFTHEN) {
2425 /* NOTE - There is similar code to this block below for handling
2426 TRIE nodes on a re-study. If you change stuff here check there
2427 too. */
2428 I32 max1 = 0, min1 = I32_MAX, num = 0;
2429 struct regnode_charclass_class accum;
2430 regnode * const startbranch=scan;
2431
2432 if (flags & SCF_DO_SUBSTR)
2433 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2434 if (flags & SCF_DO_STCLASS)
2435 cl_init_zero(pRExC_state, &accum);
2436
2437 while (OP(scan) == code) {
2438 I32 deltanext, minnext, f = 0, fake;
2439 struct regnode_charclass_class this_class;
2440
2441 num++;
2442 data_fake.flags = 0;
2443 if (data) {
2444 data_fake.whilem_c = data->whilem_c;
2445 data_fake.last_closep = data->last_closep;
2446 }
2447 else
2448 data_fake.last_closep = &fake;
58e23c8d
YO
2449
2450 data_fake.pos_delta = delta;
8aa23a47
YO
2451 next = regnext(scan);
2452 scan = NEXTOPER(scan);
2453 if (code != BRANCH)
c277df42 2454 scan = NEXTOPER(scan);
8aa23a47
YO
2455 if (flags & SCF_DO_STCLASS) {
2456 cl_init(pRExC_state, &this_class);
2457 data_fake.start_class = &this_class;
2458 f = SCF_DO_STCLASS_AND;
58e23c8d 2459 }
8aa23a47
YO
2460 if (flags & SCF_WHILEM_VISITED_POS)
2461 f |= SCF_WHILEM_VISITED_POS;
2462
2463 /* we suppose the run is continuous, last=next...*/
2464 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2465 next, &data_fake,
2466 stopparen, recursed, NULL, f,depth+1);
2467 if (min1 > minnext)
2468 min1 = minnext;
2469 if (max1 < minnext + deltanext)
2470 max1 = minnext + deltanext;
2471 if (deltanext == I32_MAX)
2472 is_inf = is_inf_internal = 1;
2473 scan = next;
2474 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2475 pars++;
2476 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2477 if ( stopmin > minnext)
2478 stopmin = min + min1;
2479 flags &= ~SCF_DO_SUBSTR;
2480 if (data)
2481 data->flags |= SCF_SEEN_ACCEPT;
2482 }
2483 if (data) {
2484 if (data_fake.flags & SF_HAS_EVAL)
2485 data->flags |= SF_HAS_EVAL;
2486 data->whilem_c = data_fake.whilem_c;
3dab1dad 2487 }
8aa23a47
YO
2488 if (flags & SCF_DO_STCLASS)
2489 cl_or(pRExC_state, &accum, &this_class);
2490 }
2491 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2492 min1 = 0;
2493 if (flags & SCF_DO_SUBSTR) {
2494 data->pos_min += min1;
2495 data->pos_delta += max1 - min1;
2496 if (max1 != min1 || is_inf)
2497 data->longest = &(data->longest_float);
2498 }
2499 min += min1;
2500 delta += max1 - min1;
2501 if (flags & SCF_DO_STCLASS_OR) {
2502 cl_or(pRExC_state, data->start_class, &accum);
2503 if (min1) {
2504 cl_and(data->start_class, and_withp);
2505 flags &= ~SCF_DO_STCLASS;
653099ff 2506 }
8aa23a47
YO
2507 }
2508 else if (flags & SCF_DO_STCLASS_AND) {
2509 if (min1) {
2510 cl_and(data->start_class, &accum);
2511 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2512 }
8aa23a47
YO
2513 else {
2514 /* Switch to OR mode: cache the old value of
2515 * data->start_class */
2516 INIT_AND_WITHP;
2517 StructCopy(data->start_class, and_withp,
2518 struct regnode_charclass_class);
2519 flags &= ~SCF_DO_STCLASS_AND;
2520 StructCopy(&accum, data->start_class,
2521 struct regnode_charclass_class);
2522 flags |= SCF_DO_STCLASS_OR;
2523 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2524 }
8aa23a47 2525 }
a3621e74 2526
8aa23a47
YO
2527 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2528 /* demq.
a3621e74 2529
8aa23a47
YO
2530 Assuming this was/is a branch we are dealing with: 'scan' now
2531 points at the item that follows the branch sequence, whatever
2532 it is. We now start at the beginning of the sequence and look
2533 for subsequences of
a3621e74 2534
8aa23a47
YO
2535 BRANCH->EXACT=>x1
2536 BRANCH->EXACT=>x2
2537 tail
a3621e74 2538
8aa23a47 2539 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2540
8aa23a47
YO
2541 If we can find such a subseqence we need to turn the first
2542 element into a trie and then add the subsequent branch exact
2543 strings to the trie.
a3621e74 2544
8aa23a47 2545 We have two cases
a3621e74 2546
8aa23a47 2547 1. patterns where the whole set of branch can be converted.
a3621e74 2548
8aa23a47 2549 2. patterns where only a subset can be converted.
a3621e74 2550
8aa23a47
YO
2551 In case 1 we can replace the whole set with a single regop
2552 for the trie. In case 2 we need to keep the start and end
2553 branchs so
a3621e74 2554
8aa23a47
YO
2555 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2556 becomes BRANCH TRIE; BRANCH X;
786e8c11 2557
8aa23a47
YO
2558 There is an additional case, that being where there is a
2559 common prefix, which gets split out into an EXACT like node
2560 preceding the TRIE node.
a3621e74 2561
8aa23a47
YO
2562 If x(1..n)==tail then we can do a simple trie, if not we make
2563 a "jump" trie, such that when we match the appropriate word
2564 we "jump" to the appopriate tail node. Essentailly we turn
2565 a nested if into a case structure of sorts.
b515a41d 2566
8aa23a47
YO
2567 */
2568
2569 int made=0;
2570 if (!re_trie_maxbuff) {
2571 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2572 if (!SvIOK(re_trie_maxbuff))
2573 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2574 }
2575 if ( SvIV(re_trie_maxbuff)>=0 ) {
2576 regnode *cur;
2577 regnode *first = (regnode *)NULL;
2578 regnode *last = (regnode *)NULL;
2579 regnode *tail = scan;
2580 U8 optype = 0;
2581 U32 count=0;
a3621e74
YO
2582
2583#ifdef DEBUGGING
8aa23a47 2584 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2585#endif
8aa23a47
YO
2586 /* var tail is used because there may be a TAIL
2587 regop in the way. Ie, the exacts will point to the
2588 thing following the TAIL, but the last branch will
2589 point at the TAIL. So we advance tail. If we
2590 have nested (?:) we may have to move through several
2591 tails.
2592 */
2593
2594 while ( OP( tail ) == TAIL ) {
2595 /* this is the TAIL generated by (?:) */
2596 tail = regnext( tail );
2597 }
a3621e74 2598
8aa23a47
YO
2599
2600 DEBUG_OPTIMISE_r({
2601 regprop(RExC_rx, mysv, tail );
2602 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2603 (int)depth * 2 + 2, "",
2604 "Looking for TRIE'able sequences. Tail node is: ",
2605 SvPV_nolen_const( mysv )
2606 );
2607 });
2608
2609 /*
2610
2611 step through the branches, cur represents each
2612 branch, noper is the first thing to be matched
2613 as part of that branch and noper_next is the
2614 regnext() of that node. if noper is an EXACT
2615 and noper_next is the same as scan (our current
2616 position in the regex) then the EXACT branch is
2617 a possible optimization target. Once we have
2618 two or more consequetive such branches we can
2619 create a trie of the EXACT's contents and stich
2620 it in place. If the sequence represents all of
2621 the branches we eliminate the whole thing and
2622 replace it with a single TRIE. If it is a
2623 subsequence then we need to stitch it in. This
2624 means the first branch has to remain, and needs
2625 to be repointed at the item on the branch chain
2626 following the last branch optimized. This could
2627 be either a BRANCH, in which case the
2628 subsequence is internal, or it could be the
2629 item following the branch sequence in which
2630 case the subsequence is at the end.
2631
2632 */
2633
2634 /* dont use tail as the end marker for this traverse */
2635 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2636 regnode * const noper = NEXTOPER( cur );
b515a41d 2637#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2638 regnode * const noper_next = regnext( noper );
b515a41d
YO
2639#endif
2640
8aa23a47
YO
2641 DEBUG_OPTIMISE_r({
2642 regprop(RExC_rx, mysv, cur);
2643 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2644 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2645
2646 regprop(RExC_rx, mysv, noper);
2647 PerlIO_printf( Perl_debug_log, " -> %s",
2648 SvPV_nolen_const(mysv));
2649
2650 if ( noper_next ) {
2651 regprop(RExC_rx, mysv, noper_next );
2652 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2653 SvPV_nolen_const(mysv));
2654 }
2655 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2656 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2657 });
2658 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2659 : PL_regkind[ OP( noper ) ] == EXACT )
2660 || OP(noper) == NOTHING )
786e8c11 2661#ifdef NOJUMPTRIE
8aa23a47 2662 && noper_next == tail
786e8c11 2663#endif
8aa23a47
YO
2664 && count < U16_MAX)
2665 {
2666 count++;
2667 if ( !first || optype == NOTHING ) {
2668 if (!first) first = cur;
2669 optype = OP( noper );
2670 } else {
2671 last = cur;
2672 }
2673 } else {
2674 if ( last ) {
2675 make_trie( pRExC_state,
2676 startbranch, first, cur, tail, count,
2677 optype, depth+1 );
2678 }
2679 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2680#ifdef NOJUMPTRIE
8aa23a47 2681 && noper_next == tail
786e8c11 2682#endif
8aa23a47
YO
2683 ){
2684 count = 1;
2685 first = cur;
2686 optype = OP( noper );
2687 } else {
2688 count = 0;
2689 first = NULL;
2690 optype = 0;
2691 }
2692 last = NULL;
2693 }
2694 }
2695 DEBUG_OPTIMISE_r({
2696 regprop(RExC_rx, mysv, cur);
2697 PerlIO_printf( Perl_debug_log,
2698 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2699 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2700
2701 });
2702 if ( last ) {
2703 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2704#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2705 if ( ((made == MADE_EXACT_TRIE &&
2706 startbranch == first)
2707 || ( first_non_open == first )) &&
2708 depth==0 ) {
2709 flags |= SCF_TRIE_RESTUDY;
2710 if ( startbranch == first
2711 && scan == tail )
2712 {
2713 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2714 }
2715 }
3dab1dad 2716#endif
8aa23a47
YO
2717 }
2718 }
2719
2720 } /* do trie */
2721
653099ff 2722 }
8aa23a47
YO
2723 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2724 scan = NEXTOPER(NEXTOPER(scan));
2725 } else /* single branch is optimized. */
2726 scan = NEXTOPER(scan);
2727 continue;
2728 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2729 scan_frame *newframe = NULL;
2730 I32 paren;
2731 regnode *start;
2732 regnode *end;
2733
2734 if (OP(scan) != SUSPEND) {
2735 /* set the pointer */
2736 if (OP(scan) == GOSUB) {
2737 paren = ARG(scan);
2738 RExC_recurse[ARG2L(scan)] = scan;
2739 start = RExC_open_parens[paren-1];
2740 end = RExC_close_parens[paren-1];
2741 } else {
2742 paren = 0;
f8fc2ecf 2743 start = RExC_rxi->program + 1;
8aa23a47
YO
2744 end = RExC_opend;
2745 }
2746 if (!recursed) {
2747 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2748 SAVEFREEPV(recursed);
2749 }
2750 if (!PAREN_TEST(recursed,paren+1)) {
2751 PAREN_SET(recursed,paren+1);
2752 Newx(newframe,1,scan_frame);
2753 } else {
2754 if (flags & SCF_DO_SUBSTR) {
2755 scan_commit(pRExC_state,data,minlenp);
2756 data->longest = &(data->longest_float);
2757 }
2758 is_inf = is_inf_internal = 1;
2759 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2760 cl_anything(pRExC_state, data->start_class);
2761 flags &= ~SCF_DO_STCLASS;
2762 }
2763 } else {
2764 Newx(newframe,1,scan_frame);
2765 paren = stopparen;
2766 start = scan+2;
2767 end = regnext(scan);
2768 }
2769 if (newframe) {
2770 assert(start);
2771 assert(end);
2772 SAVEFREEPV(newframe);
2773 newframe->next = regnext(scan);
2774 newframe->last = last;
2775 newframe->stop = stopparen;
2776 newframe->prev = frame;
2777
2778 frame = newframe;
2779 scan = start;
2780 stopparen = paren;
2781 last = end;
2782
2783 continue;
2784 }
2785 }
2786 else if (OP(scan) == EXACT) {
2787 I32 l = STR_LEN(scan);
2788 UV uc;
2789 if (UTF) {
2790 const U8 * const s = (U8*)STRING(scan);
2791 l = utf8_length(s, s + l);
2792 uc = utf8_to_uvchr(s, NULL);
2793 } else {
2794 uc = *((U8*)STRING(scan));
2795 }
2796 min += l;
2797 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2798 /* The code below prefers earlier match for fixed
2799 offset, later match for variable offset. */
2800 if (data->last_end == -1) { /* Update the start info. */
2801 data->last_start_min = data->pos_min;
2802 data->last_start_max = is_inf
2803 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2804 }
8aa23a47
YO
2805 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2806 if (UTF)
2807 SvUTF8_on(data->last_found);
2808 {
2809 SV * const sv = data->last_found;
2810 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2811 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2812 if (mg && mg->mg_len >= 0)
2813 mg->mg_len += utf8_length((U8*)STRING(scan),
2814 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2815 }
8aa23a47
YO
2816 data->last_end = data->pos_min + l;
2817 data->pos_min += l; /* As in the first entry. */
2818 data->flags &= ~SF_BEFORE_EOL;
2819 }
2820 if (flags & SCF_DO_STCLASS_AND) {
2821 /* Check whether it is compatible with what we know already! */
2822 int compat = 1;
2823
2824 if (uc >= 0x100 ||
2825 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2826 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2827 && (!(data->start_class->flags & ANYOF_FOLD)
2828 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2829 )
2830 compat = 0;
2831 ANYOF_CLASS_ZERO(data->start_class);
2832 ANYOF_BITMAP_ZERO(data->start_class);
2833 if (compat)
2834 ANYOF_BITMAP_SET(data->start_class, uc);
2835 data->start_class->flags &= ~ANYOF_EOS;
2836 if (uc < 0x100)
2837 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2838 }
2839 else if (flags & SCF_DO_STCLASS_OR) {
2840 /* false positive possible if the class is case-folded */
2841 if (uc < 0x100)
2842 ANYOF_BITMAP_SET(data->start_class, uc);
2843 else
2844 data->start_class->flags |= ANYOF_UNICODE_ALL;
2845 data->start_class->flags &= ~ANYOF_EOS;
2846 cl_and(data->start_class, and_withp);
2847 }
2848 flags &= ~SCF_DO_STCLASS;
2849 }
2850 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2851 I32 l = STR_LEN(scan);
2852 UV uc = *((U8*)STRING(scan));
2853
2854 /* Search for fixed substrings supports EXACT only. */
2855 if (flags & SCF_DO_SUBSTR) {
2856 assert(data);
2857 scan_commit(pRExC_state, data, minlenp);
2858 }
2859 if (UTF) {
2860 const U8 * const s = (U8 *)STRING(scan);
2861 l = utf8_length(s, s + l);
2862 uc = utf8_to_uvchr(s, NULL);
2863 }
2864 min += l;
2865 if (flags & SCF_DO_SUBSTR)
2866 data->pos_min += l;
2867 if (flags & SCF_DO_STCLASS_AND) {
2868 /* Check whether it is compatible with what we know already! */
2869 int compat = 1;
2870
2871 if (uc >= 0x100 ||
2872 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2873 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2874 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2875 compat = 0;
2876 ANYOF_CLASS_ZERO(data->start_class);
2877 ANYOF_BITMAP_ZERO(data->start_class);
2878 if (compat) {
2879 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2880 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2881 data->start_class->flags |= ANYOF_FOLD;
2882 if (OP(scan) == EXACTFL)
2883 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2884 }
8aa23a47
YO
2885 }
2886 else if (flags & SCF_DO_STCLASS_OR) {
2887 if (data->start_class->flags & ANYOF_FOLD) {
2888 /* false positive possible if the class is case-folded.
2889 Assume that the locale settings are the same... */
1aa99e6b
IH
2890 if (uc < 0x100)
2891 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2892 data->start_class->flags &= ~ANYOF_EOS;
2893 }
8aa23a47 2894 cl_and(data->start_class, and_withp);
653099ff 2895 }
8aa23a47
YO
2896 flags &= ~SCF_DO_STCLASS;
2897 }
2898 else if (strchr((const char*)PL_varies,OP(scan))) {
2899 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2900 I32 f = flags, pos_before = 0;
2901 regnode * const oscan = scan;
2902 struct regnode_charclass_class this_class;
2903 struct regnode_charclass_class *oclass = NULL;
2904 I32 next_is_eval = 0;
2905
2906 switch (PL_regkind[OP(scan)]) {
2907 case WHILEM: /* End of (?:...)* . */
2908 scan = NEXTOPER(scan);
2909 goto finish;
2910 case PLUS:
2911 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2912 next = NEXTOPER(scan);
2913 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2914 mincount = 1;
2915 maxcount = REG_INFTY;
2916 next = regnext(scan);
2917 scan = NEXTOPER(scan);
2918 goto do_curly;
2919 }
2920 }
2921 if (flags & SCF_DO_SUBSTR)
2922 data->pos_min++;
2923 min++;
2924 /* Fall through. */
2925 case STAR:
2926 if (flags & SCF_DO_STCLASS) {
2927 mincount = 0;
2928 maxcount = REG_INFTY;
2929 next = regnext(scan);
2930 scan = NEXTOPER(scan);
2931 goto do_curly;
2932 }
2933 is_inf = is_inf_internal = 1;
2934 scan = regnext(scan);
c277df42 2935 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2936 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2937 data->longest = &(data->longest_float);
c277df42 2938 }
8aa23a47
YO
2939 goto optimize_curly_tail;
2940 case CURLY:
2941 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2942 && (scan->flags == stopparen))
2943 {
2944 mincount = 1;
2945 maxcount = 1;
2946 } else {
2947 mincount = ARG1(scan);
2948 maxcount = ARG2(scan);
653099ff 2949 }
8aa23a47
YO
2950 next = regnext(scan);
2951 if (OP(scan) == CURLYX) {
2952 I32 lp = (data ? *(data->last_closep) : 0);
2953 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2954 }
8aa23a47
YO
2955 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2956 next_is_eval = (OP(scan) == EVAL);
2957 do_curly:
2958 if (flags & SCF_DO_SUBSTR) {
2959 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2960 pos_before = data->pos_min;
b45f050a 2961 }
8aa23a47
YO
2962 if (data) {
2963 fl = data->flags;
2964 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2965 if (is_inf)
2966 data->flags |= SF_IS_INF;
2967 }
2968 if (flags & SCF_DO_STCLASS) {
2969 cl_init(pRExC_state, &this_class);
2970 oclass = data->start_class;
2971 data->start_class = &this_class;
2972 f |= SCF_DO_STCLASS_AND;
2973 f &= ~SCF_DO_STCLASS_OR;
2974 }
2975 /* These are the cases when once a subexpression
2976 fails at a particular position, it cannot succeed
2977 even after backtracking at the enclosing scope.
2978
2979 XXXX what if minimal match and we are at the
2980 initial run of {n,m}? */
2981 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2982 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2983
8aa23a47
YO
2984 /* This will finish on WHILEM, setting scan, or on NULL: */
2985 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2986 last, data, stopparen, recursed, NULL,
2987 (mincount == 0
2988 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2989
8aa23a47
YO
2990 if (flags & SCF_DO_STCLASS)
2991 data->start_class = oclass;
2992 if (mincount == 0 || minnext == 0) {
2993 if (flags & SCF_DO_STCLASS_OR) {
2994 cl_or(pRExC_state, data->start_class, &this_class);
2995 }
2996 else if (flags & SCF_DO_STCLASS_AND) {
2997 /* Switch to OR mode: cache the old value of
2998 * data->start_class */
2999 INIT_AND_WITHP;
3000 StructCopy(data->start_class, and_withp,
3001 struct regnode_charclass_class);
3002 flags &= ~SCF_DO_STCLASS_AND;
3003 StructCopy(&this_class, data->start_class,
3004 struct regnode_charclass_class);
3005 flags |= SCF_DO_STCLASS_OR;
3006 data->start_class->flags |= ANYOF_EOS;
3007 }
3008 } else { /* Non-zero len */
3009 if (flags & SCF_DO_STCLASS_OR) {
3010 cl_or(pRExC_state, data->start_class, &this_class);
3011 cl_and(data->start_class, and_withp);
3012 }
3013 else if (flags & SCF_DO_STCLASS_AND)
3014 cl_and(data->start_class, &this_class);
3015 flags &= ~SCF_DO_STCLASS;
3016 }
3017 if (!scan) /* It was not CURLYX, but CURLY. */
3018 scan = next;
3019 if ( /* ? quantifier ok, except for (?{ ... }) */
3020 (next_is_eval || !(mincount == 0 && maxcount == 1))
3021 && (minnext == 0) && (deltanext == 0)
3022 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3023 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3024 && ckWARN(WARN_REGEXP))
3025 {
3026 vWARN(RExC_parse,
3027 "Quantifier unexpected on zero-length expression");
3028 }
3029
3030 min += minnext * mincount;
3031 is_inf_internal |= ((maxcount == REG_INFTY
3032 && (minnext + deltanext) > 0)
3033 || deltanext == I32_MAX);
3034 is_inf |= is_inf_internal;
3035 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3036
3037 /* Try powerful optimization CURLYX => CURLYN. */
3038 if ( OP(oscan) == CURLYX && data
3039 && data->flags & SF_IN_PAR
3040 && !(data->flags & SF_HAS_EVAL)
3041 && !deltanext && minnext == 1 ) {
3042 /* Try to optimize to CURLYN. */
3043 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3044 regnode * const nxt1 = nxt;
497b47a8 3045#ifdef DEBUGGING
8aa23a47 3046 regnode *nxt2;
497b47a8 3047#endif
c277df42 3048
8aa23a47
YO
3049 /* Skip open. */
3050 nxt = regnext(nxt);
3051 if (!strchr((const char*)PL_simple,OP(nxt))
3052 && !(PL_regkind[OP(nxt)] == EXACT
3053 && STR_LEN(nxt) == 1))
3054 goto nogo;
497b47a8 3055#ifdef DEBUGGING
8aa23a47 3056 nxt2 = nxt;
497b47a8 3057#endif
8aa23a47
YO
3058 nxt = regnext(nxt);
3059 if (OP(nxt) != CLOSE)
3060 goto nogo;
3061 if (RExC_open_parens) {
3062 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3063 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3064 }
3065 /* Now we know that nxt2 is the only contents: */
3066 oscan->flags = (U8)ARG(nxt);
3067 OP(oscan) = CURLYN;
3068 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3069
c277df42 3070#ifdef DEBUGGING
8aa23a47
YO
3071 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3072 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3073 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3074 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3075 OP(nxt + 1) = OPTIMIZED; /* was count. */
3076 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3077#endif
8aa23a47
YO
3078 }
3079 nogo:
3080
3081 /* Try optimization CURLYX => CURLYM. */
3082 if ( OP(oscan) == CURLYX && data
3083 && !(data->flags & SF_HAS_PAR)
3084 && !(data->flags & SF_HAS_EVAL)
3085 && !deltanext /* atom is fixed width */
3086 && minnext != 0 /* CURLYM can't handle zero width */
3087 ) {
3088 /* XXXX How to optimize if data == 0? */
3089 /* Optimize to a simpler form. */
3090 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3091 regnode *nxt2;
3092
3093 OP(oscan) = CURLYM;
3094 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3095 && (OP(nxt2) != WHILEM))
3096 nxt = nxt2;
3097 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3098 /* Need to optimize away parenths. */
3099 if (data->flags & SF_IN_PAR) {
3100 /* Set the parenth number. */
3101 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3102
3103 if (OP(nxt) != CLOSE)
3104 FAIL("Panic opt close");
3105 oscan->flags = (U8)ARG(nxt);
3106 if (RExC_open_parens) {
3107 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3108 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3109 }
8aa23a47
YO
3110 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3111 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3112
c277df42 3113#ifdef DEBUGGING
8aa23a47
YO
3114 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3115 OP(nxt + 1) = OPTIMIZED; /* was count. */
3116 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3117 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3118#endif
c277df42 3119#if 0
8aa23a47
YO
3120 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3121 regnode *nnxt = regnext(nxt1);
3122
3123 if (nnxt == nxt) {
3124 if (reg_off_by_arg[OP(nxt1)])
3125 ARG_SET(nxt1, nxt2 - nxt1);
3126 else if (nxt2 - nxt1 < U16_MAX)
3127 NEXT_OFF(nxt1) = nxt2 - nxt1;
3128 else
3129 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3130 }
8aa23a47 3131 nxt1 = nnxt;
c277df42 3132 }
5d1c421c 3133#endif
8aa23a47
YO
3134 /* Optimize again: */
3135 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3136 NULL, stopparen, recursed, NULL, 0,depth+1);
3137 }
3138 else
3139 oscan->flags = 0;
3140 }
3141 else if ((OP(oscan) == CURLYX)
3142 && (flags & SCF_WHILEM_VISITED_POS)
3143 /* See the comment on a similar expression above.
3144 However, this time it not a subexpression
3145 we care about, but the expression itself. */
3146 && (maxcount == REG_INFTY)
3147 && data && ++data->whilem_c < 16) {
3148 /* This stays as CURLYX, we can put the count/of pair. */
3149 /* Find WHILEM (as in regexec.c) */
3150 regnode *nxt = oscan + NEXT_OFF(oscan);
3151
3152 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3153 nxt += ARG(nxt);
3154 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3155 | (RExC_whilem_seen << 4)); /* On WHILEM */
3156 }
3157 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3158 pars++;
3159 if (flags & SCF_DO_SUBSTR) {
3160 SV *last_str = NULL;
3161 int counted = mincount != 0;
a0ed51b3 3162
8aa23a47
YO
3163 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3164#if defined(SPARC64_GCC_WORKAROUND)
3165 I32 b = 0;
3166 STRLEN l = 0;
3167 const char *s = NULL;
3168 I32 old = 0;
b515a41d 3169
8aa23a47
YO
3170 if (pos_before >= data->last_start_min)
3171 b = pos_before;
3172 else
3173 b = data->last_start_min;
b515a41d 3174
8aa23a47
YO
3175 l = 0;
3176 s = SvPV_const(data->last_found, l);
3177 old = b - data->last_start_min;
3178
3179#else
3180 I32 b = pos_before >= data->last_start_min
3181 ? pos_before : data->last_start_min;
3182 STRLEN l;
3183 const char * const s = SvPV_const(data->last_found, l);
3184 I32 old = b - data->last_start_min;
3185#endif
3186
3187 if (UTF)
3188 old = utf8_hop((U8*)s, old) - (U8*)s;
3189
3190 l -= old;
3191 /* Get the added string: */
3192 last_str = newSVpvn(s + old, l);
3193 if (UTF)
3194 SvUTF8_on(last_str);
3195 if (deltanext == 0 && pos_before == b) {
3196 /* What was added is a constant string */
3197 if (mincount > 1) {
3198 SvGROW(last_str, (mincount * l) + 1);
3199 repeatcpy(SvPVX(last_str) + l,
3200 SvPVX_const(last_str), l, mincount - 1);
3201 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3202 /* Add additional parts. */
3203 SvCUR_set(data->last_found,
3204 SvCUR(data->last_found) - l);
3205 sv_catsv(data->last_found, last_str);
3206 {
3207 SV * sv = data->last_found;
3208 MAGIC *mg =
3209 SvUTF8(sv) && SvMAGICAL(sv) ?
3210 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3211 if (mg && mg->mg_len >= 0)
3212 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3213 }
8aa23a47 3214 data->last_end += l * (mincount - 1);
b515a41d 3215 }
8aa23a47
YO
3216 } else {
3217 /* start offset must point into the last copy */
3218 data->last_start_min += minnext * (mincount - 1);
3219 data->last_start_max += is_inf ? I32_MAX
3220 : (maxcount - 1) * (minnext + data->pos_delta);
3221 }
c277df42 3222 }
8aa23a47
YO
3223 /* It is counted once already... */
3224 data->pos_min += minnext * (mincount - counted);
3225 data->pos_delta += - counted * deltanext +
3226 (minnext + deltanext) * maxcount - minnext * mincount;
3227 if (mincount != maxcount) {
3228 /* Cannot extend fixed substrings found inside
3229 the group. */
3230 scan_commit(pRExC_state,data,minlenp);
3231 if (mincount && last_str) {
3232 SV * const sv = data->last_found;
3233 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3234 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3235
3236 if (mg)
3237 mg->mg_len = -1;
3238 sv_setsv(sv, last_str);
3239 data->last_end = data->pos_min;
3240 data->last_start_min =
3241 data->pos_min - CHR_SVLEN(last_str);
3242 data->last_start_max = is_inf
3243 ? I32_MAX
3244 : data->pos_min + data->pos_delta
3245 - CHR_SVLEN(last_str);
3246 }
3247 data->longest = &(data->longest_float);
3248 }
3249 SvREFCNT_dec(last_str);
c277df42 3250 }
8aa23a47
YO
3251 if (data && (fl & SF_HAS_EVAL))
3252 data->flags |= SF_HAS_EVAL;
3253 optimize_curly_tail:
3254 if (OP(oscan) != CURLYX) {
3255 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3256 && NEXT_OFF(next))
3257 NEXT_OFF(oscan) += NEXT_OFF(next);
3258 }
3259 continue;
3260 default: /* REF and CLUMP only? */
3261 if (flags & SCF_DO_SUBSTR) {
3262 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3263 data->longest = &(data->longest_float);
3264 }
3265 is_inf = is_inf_internal = 1;
3266 if (flags & SCF_DO_STCLASS_OR)
3267 cl_anything(pRExC_state, data->start_class);
3268 flags &= ~SCF_DO_STCLASS;
3269 break;
c277df42 3270 }
8aa23a47
YO
3271 }
3272 else if (strchr((const char*)PL_simple,OP(scan))) {
3273 int value = 0;
653099ff 3274
8aa23a47
YO
3275 if (flags & SCF_DO_SUBSTR) {
3276 scan_commit(pRExC_state,data,minlenp);
3277 data->pos_min++;
3278 }
3279 min++;
3280 if (flags & SCF_DO_STCLASS) {
3281 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3282
8aa23a47
YO
3283 /* Some of the logic below assumes that switching
3284 locale on will only add false positives. */
3285 switch (PL_regkind[OP(scan)]) {
3286 case SANY:
3287 default:
3288 do_default:
3289 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3290 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3291 cl_anything(pRExC_state, data->start_class);
3292 break;
3293 case REG_ANY:
3294 if (OP(scan) == SANY)
3295 goto do_default;
3296 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3297 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3298 || (data->start_class->flags & ANYOF_CLASS));
3299 cl_anything(pRExC_state, data->start_class);
653099ff 3300 }
8aa23a47
YO
3301 if (flags & SCF_DO_STCLASS_AND || !value)
3302 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3303 break;
3304 case ANYOF:
3305 if (flags & SCF_DO_STCLASS_AND)
3306 cl_and(data->start_class,
3307 (struct regnode_charclass_class*)scan);
653099ff 3308 else
8aa23a47
YO
3309 cl_or(pRExC_state, data->start_class,
3310 (struct regnode_charclass_class*)scan);
3311 break;
3312 case ALNUM:
3313 if (flags & SCF_DO_STCLASS_AND) {
3314 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3315 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3316 for (value = 0; value < 256; value++)
3317 if (!isALNUM(value))
3318 ANYOF_BITMAP_CLEAR(data->start_class, value);
3319 }
653099ff 3320 }
8aa23a47
YO
3321 else {
3322 if (data->start_class->flags & ANYOF_LOCALE)
3323 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3324 else {
3325 for (value = 0; value < 256; value++)
3326 if (isALNUM(value))
3327 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3328 }
8aa23a47
YO
3329 }
3330 break;
3331 case ALNUML:
3332 if (flags & SCF_DO_STCLASS_AND) {
3333 if (data->start_class->flags & ANYOF_LOCALE)
3334 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3335 }
3336 else {
3337 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3338 data->start_class->flags |= ANYOF_LOCALE;
3339 }
3340 break;
3341 case NALNUM:
3342 if (flags & SCF_DO_STCLASS_AND) {
3343 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3344 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3345 for (value = 0; value < 256; value++)
3346 if (isALNUM(value))
3347 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3348 }
3349 }
8aa23a47
YO
3350 else {
3351 if (data->start_class->flags & ANYOF_LOCALE)
3352 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3353 else {
3354 for (value = 0; value < 256; value++)
3355 if (!isALNUM(value))
3356 ANYOF_BITMAP_SET(data->start_class, value);
3357 }
653099ff 3358 }
8aa23a47
YO
3359 break;
3360 case NALNUML:
3361 if (flags & SCF_DO_STCLASS_AND) {
3362 if (data->start_class->flags & ANYOF_LOCALE)
3363 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3364 }
8aa23a47
YO
3365 else {
3366 data->start_class->flags |= ANYOF_LOCALE;
3367 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3368 }
3369 break;
3370 case SPACE:
3371 if (flags & SCF_DO_STCLASS_AND) {
3372 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3373 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3374 for (value = 0; value < 256; value++)
3375 if (!isSPACE(value))
3376 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3377 }
3378 }
8aa23a47
YO
3379 else {
3380 if (data->start_class->flags & ANYOF_LOCALE)
3381 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3382 else {
3383 for (value = 0; value < 256; value++)
3384 if (isSPACE(value))
3385 ANYOF_BITMAP_SET(data->start_class, value);
3386 }
653099ff 3387 }
8aa23a47
YO
3388 break;
3389 case SPACEL:
3390 if (flags & SCF_DO_STCLASS_AND) {
3391 if (data->start_class->flags & ANYOF_LOCALE)
3392 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3393 }
3394 else {
3395 data->start_class->flags |= ANYOF_LOCALE;
3396 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3397 }
3398 break;
3399 case NSPACE:
3400 if (flags & SCF_DO_STCLASS_AND) {
3401 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3402 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3403 for (value = 0; value < 256; value++)
3404 if (isSPACE(value))
3405 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3406 }
8aa23a47
YO
3407 }
3408 else {
3409 if (data->start_class->flags & ANYOF_LOCALE)
3410 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3411 else {
3412 for (value = 0; value < 256; value++)
3413 if (!isSPACE(value))
3414 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3415 }
3416 }
8aa23a47
YO
3417 break;
3418 case NSPACEL:
3419 if (flags & SCF_DO_STCLASS_AND) {
3420 if (data->start_class->flags & ANYOF_LOCALE) {
3421 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3422 for (value = 0; value < 256; value++)
3423 if (!isSPACE(value))
3424 ANYOF_BITMAP_CLEAR(data->start_class, value);
3425 }
653099ff 3426 }
8aa23a47
YO
3427 else {
3428 data->start_class->flags |= ANYOF_LOCALE;
3429 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3430 }
3431 break;
3432 case DIGIT:
3433 if (flags & SCF_DO_STCLASS_AND) {
3434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3435 for (value = 0; value < 256; value++)
3436 if (!isDIGIT(value))
3437 ANYOF_BITMAP_CLEAR(data->start_class, value);
3438 }
3439 else {
3440 if (data->start_class->flags & ANYOF_LOCALE)
3441 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3442 else {
3443 for (value = 0; value < 256; value++)
3444 if (isDIGIT(value))
3445 ANYOF_BITMAP_SET(data->start_class, value);
3446 }
3447 }
3448 break;
3449 case NDIGIT:
3450 if (flags & SCF_DO_STCLASS_AND) {
3451 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3452 for (value = 0; value < 256; value++)
3453 if (isDIGIT(value))
3454 ANYOF_BITMAP_CLEAR(data->start_class, value);
3455 }
3456 else {
3457 if (data->start_class->flags & ANYOF_LOCALE)
3458 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3459 else {
3460 for (value = 0; value < 256; value++)
3461 if (!isDIGIT(value))
3462 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3463 }
3464 }
8aa23a47
YO
3465 break;
3466 }
3467 if (flags & SCF_DO_STCLASS_OR)
3468 cl_and(data->start_class, and_withp);
3469 flags &= ~SCF_DO_STCLASS;
3470 }
3471 }
3472 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3473 data->flags |= (OP(scan) == MEOL
3474 ? SF_BEFORE_MEOL
3475 : SF_BEFORE_SEOL);
3476 }
3477 else if ( PL_regkind[OP(scan)] == BRANCHJ
3478 /* Lookbehind, or need to calculate parens/evals/stclass: */
3479 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3480 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3481 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3482 || OP(scan) == UNLESSM )
3483 {
3484 /* Negative Lookahead/lookbehind
3485 In this case we can't do fixed string optimisation.
3486 */
1de06328 3487
8aa23a47
YO
3488 I32 deltanext, minnext, fake = 0;
3489 regnode *nscan;
3490 struct regnode_charclass_class intrnl;
3491 int f = 0;
1de06328 3492
8aa23a47
YO
3493 data_fake.flags = 0;
3494 if (data) {
3495 data_fake.whilem_c = data->whilem_c;
3496 data_fake.last_closep = data->last_closep;
c277df42 3497 }
8aa23a47
YO
3498 else
3499 data_fake.last_closep = &fake;
58e23c8d 3500 data_fake.pos_delta = delta;
8aa23a47
YO
3501 if ( flags & SCF_DO_STCLASS && !scan->flags
3502 && OP(scan) == IFMATCH ) { /* Lookahead */
3503 cl_init(pRExC_state, &intrnl);
3504 data_fake.start_class = &intrnl;
3505 f |= SCF_DO_STCLASS_AND;
3506 }
3507 if (flags & SCF_WHILEM_VISITED_POS)
3508 f |= SCF_WHILEM_VISITED_POS;
3509 next = regnext(scan);
3510 nscan = NEXTOPER(NEXTOPER(scan));
3511 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3512 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3513 if (scan->flags) {
3514 if (deltanext) {
58e23c8d 3515 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3516 }
3517 else if (minnext > (I32)U8_MAX) {
58e23c8d 3518 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3519 }
3520 scan->flags = (U8)minnext;
3521 }
3522 if (data) {
3523 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3524 pars++;
3525 if (data_fake.flags & SF_HAS_EVAL)
3526 data->flags |= SF_HAS_EVAL;
3527 data->whilem_c = data_fake.whilem_c;
3528 }
3529 if (f & SCF_DO_STCLASS_AND) {
3530 const int was = (data->start_class->flags & ANYOF_EOS);
3531
3532 cl_and(data->start_class, &intrnl);
3533 if (was)
3534 data->start_class->flags |= ANYOF_EOS;
3535 }
cb434fcc 3536 }
8aa23a47
YO
3537#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3538 else {
3539 /* Positive Lookahead/lookbehind
3540 In this case we can do fixed string optimisation,
3541 but we must be careful about it. Note in the case of
3542 lookbehind the positions will be offset by the minimum
3543 length of the pattern, something we won't know about
3544 until after the recurse.
3545 */
3546 I32 deltanext, fake = 0;
3547 regnode *nscan;
3548 struct regnode_charclass_class intrnl;
3549 int f = 0;
3550 /* We use SAVEFREEPV so that when the full compile
3551 is finished perl will clean up the allocated
3552 minlens when its all done. This was we don't
3553 have to worry about freeing them when we know
3554 they wont be used, which would be a pain.
3555 */
3556 I32 *minnextp;
3557 Newx( minnextp, 1, I32 );
3558 SAVEFREEPV(minnextp);
3559
3560 if (data) {
3561 StructCopy(data, &data_fake, scan_data_t);
3562 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3563 f |= SCF_DO_SUBSTR;
3564 if (scan->flags)
3565 scan_commit(pRExC_state, &data_fake,minlenp);
3566 data_fake.last_found=newSVsv(data->last_found);
3567 }
3568 }
3569 else
3570 data_fake.last_closep = &fake;
3571 data_fake.flags = 0;
58e23c8d 3572 data_fake.pos_delta = delta;
8aa23a47
YO
3573 if (is_inf)
3574 data_fake.flags |= SF_IS_INF;
3575 if ( flags & SCF_DO_STCLASS && !scan->flags
3576 && OP(scan) == IFMATCH ) { /* Lookahead */
3577 cl_init(pRExC_state, &intrnl);
3578 data_fake.start_class = &intrnl;
3579 f |= SCF_DO_STCLASS_AND;
3580 }
3581 if (flags & SCF_WHILEM_VISITED_POS)
3582 f |= SCF_WHILEM_VISITED_POS;
3583 next = regnext(scan);
3584 nscan = NEXTOPER(NEXTOPER(scan));
3585
3586 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3587 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3588 if (scan->flags) {
3589 if (deltanext) {
58e23c8d 3590 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3591 }
3592 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3593 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3594 }
3595 scan->flags = (U8)*minnextp;
3596 }
3597
3598 *minnextp += min;
3599
3600 if (f & SCF_DO_STCLASS_AND) {
3601 const int was = (data->start_class->flags & ANYOF_EOS);
3602
3603 cl_and(data->start_class, &intrnl);
3604 if (was)
3605 data->start_class->flags |= ANYOF_EOS;
3606 }
3607 if (data) {
3608 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3609 pars++;
3610 if (data_fake.flags & SF_HAS_EVAL)
3611 data->flags |= SF_HAS_EVAL;
3612 data->whilem_c = data_fake.whilem_c;
3613 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3614 if (RExC_rx->minlen<*minnextp)
3615 RExC_rx->minlen=*minnextp;
3616 scan_commit(pRExC_state, &data_fake, minnextp);
3617 SvREFCNT_dec(data_fake.last_found);
3618
3619 if ( data_fake.minlen_fixed != minlenp )
3620 {
3621 data->offset_fixed= data_fake.offset_fixed;
3622 data->minlen_fixed= data_fake.minlen_fixed;
3623 data->lookbehind_fixed+= scan->flags;
3624 }
3625 if ( data_fake.minlen_float != minlenp )
3626 {
3627 data->minlen_float= data_fake.minlen_float;
3628 data->offset_float_min=data_fake.offset_float_min;
3629 data->offset_float_max=data_fake.offset_float_max;
3630 data->lookbehind_float+= scan->flags;
3631 }
3632 }
3633 }
3634
3635
40d049e4 3636 }
8aa23a47
YO
3637#endif
3638 }
3639 else if (OP(scan) == OPEN) {
3640 if (stopparen != (I32)ARG(scan))
3641 pars++;
3642 }
3643 else if (OP(scan) == CLOSE) {
3644 if (stopparen == (I32)ARG(scan)) {
3645 break;
3646 }
3647 if ((I32)ARG(scan) == is_par) {
3648 next = regnext(scan);
b515a41d 3649
8aa23a47
YO
3650 if ( next && (OP(next) != WHILEM) && next < last)
3651 is_par = 0; /* Disable optimization */
40d049e4 3652 }
8aa23a47
YO
3653 if (data)
3654 *(data->last_closep) = ARG(scan);
3655 }
3656 else if (OP(scan) == EVAL) {
c277df42
IZ
3657 if (data)
3658 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3659 }
3660 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3661 if (flags & SCF_DO_SUBSTR) {
3662 scan_commit(pRExC_state,data,minlenp);
3663 flags &= ~SCF_DO_SUBSTR;
40d049e4 3664 }
8aa23a47
YO
3665 if (data && OP(scan)==ACCEPT) {
3666 data->flags |= SCF_SEEN_ACCEPT;
3667 if (stopmin > min)
3668 stopmin = min;
e2e6a0f1 3669 }
8aa23a47
YO
3670 }
3671 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3672 {
0f5d15d6 3673 if (flags & SCF_DO_SUBSTR) {
1de06328 3674 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3675 data->longest = &(data->longest_float);
3676 }
3677 is_inf = is_inf_internal = 1;
653099ff 3678 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3679 cl_anything(pRExC_state, data->start_class);
96776eda 3680 flags &= ~SCF_DO_STCLASS;
8aa23a47 3681 }
58e23c8d 3682 else if (OP(scan) == GPOS) {
bbe252da 3683 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3684 !(delta || is_inf || (data && data->pos_delta)))
3685 {
bbe252da
YO
3686 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3687 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3688 if (RExC_rx->gofs < (U32)min)
3689 RExC_rx->gofs = min;
3690 } else {
bbe252da 3691 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3692 RExC_rx->gofs = 0;
3693 }
3694 }
786e8c11 3695#ifdef TRIE_STUDY_OPT
40d049e4 3696#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3697 else if (PL_regkind[OP(scan)] == TRIE) {
3698 /* NOTE - There is similar code to this block above for handling
3699 BRANCH nodes on the initial study. If you change stuff here
3700 check there too. */
3701 regnode *trie_node= scan;
3702 regnode *tail= regnext(scan);
f8fc2ecf 3703 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3704 I32 max1 = 0, min1 = I32_MAX;
3705 struct regnode_charclass_class accum;
3706
3707 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3708 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3709 if (flags & SCF_DO_STCLASS)
3710 cl_init_zero(pRExC_state, &accum);
3711
3712 if (!trie->jump) {
3713 min1= trie->minlen;
3714 max1= trie->maxlen;
3715 } else {
3716 const regnode *nextbranch= NULL;
3717 U32 word;
3718
3719 for ( word=1 ; word <= trie->wordcount ; word++)
3720 {
3721 I32 deltanext=0, minnext=0, f = 0, fake;
3722 struct regnode_charclass_class this_class;
3723
3724 data_fake.flags = 0;
3725 if (data) {
3726 data_fake.whilem_c = data->whilem_c;
3727 data_fake.last_closep = data->last_closep;
3728 }
3729 else
3730 data_fake.last_closep = &fake;
58e23c8d 3731 data_fake.pos_delta = delta;
8aa23a47
YO
3732 if (flags & SCF_DO_STCLASS) {
3733 cl_init(pRExC_state, &this_class);
3734 data_fake.start_class = &this_class;
3735 f = SCF_DO_STCLASS_AND;
3736 }
3737 if (flags & SCF_WHILEM_VISITED_POS)
3738 f |= SCF_WHILEM_VISITED_POS;
3739
3740 if (trie->jump[word]) {
3741 if (!nextbranch)
3742 nextbranch = trie_node + trie->jump[0];
3743 scan= trie_node + trie->jump[word];
3744 /* We go from the jump point to the branch that follows
3745 it. Note this means we need the vestigal unused branches
3746 even though they arent otherwise used.
3747 */
3748 minnext = study_chunk(pRExC_state, &scan, minlenp,
3749 &deltanext, (regnode *)nextbranch, &data_fake,
3750 stopparen, recursed, NULL, f,depth+1);
3751 }
3752 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3753 nextbranch= regnext((regnode*)nextbranch);
3754
3755 if (min1 > (I32)(minnext + trie->minlen))
3756 min1 = minnext + trie->minlen;
3757 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3758 max1 = minnext + deltanext + trie->maxlen;
3759 if (deltanext == I32_MAX)
3760 is_inf = is_inf_internal = 1;
3761
3762 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3763 pars++;
3764 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3765 if ( stopmin > min + min1)
3766 stopmin = min + min1;
3767 flags &= ~SCF_DO_SUBSTR;
3768 if (data)
3769 data->flags |= SCF_SEEN_ACCEPT;
3770 }
3771 if (data) {
3772 if (data_fake.flags & SF_HAS_EVAL)
3773 data->flags |= SF_HAS_EVAL;
3774 data->whilem_c = data_fake.whilem_c;
3775 }
3776 if (flags & SCF_DO_STCLASS)
3777 cl_or(pRExC_state, &accum, &this_class);
3778 }
3779 }
3780 if (flags & SCF_DO_SUBSTR) {
3781 data->pos_min += min1;
3782 data->pos_delta += max1 - min1;
3783 if (max1 != min1 || is_inf)
3784 data->longest = &(data->longest_float);
3785 }
3786 min += min1;
3787 delta += max1 - min1;
3788 if (flags & SCF_DO_STCLASS_OR) {
3789 cl_or(pRExC_state, data->start_class, &accum);
3790 if (min1) {
3791 cl_and(data->start_class, and_withp);
3792 flags &= ~SCF_DO_STCLASS;
3793 }
3794 }
3795 else if (flags & SCF_DO_STCLASS_AND) {
3796 if (min1) {
3797 cl_and(data->start_class, &accum);
3798 flags &= ~SCF_DO_STCLASS;
3799 }
3800 else {
3801 /* Switch to OR mode: cache the old value of
3802 * data->start_class */
3803 INIT_AND_WITHP;
3804 StructCopy(data->start_class, and_withp,
3805 struct regnode_charclass_class);
3806 flags &= ~SCF_DO_STCLASS_AND;
3807 StructCopy(&accum, data->start_class,
3808 struct regnode_charclass_class);
3809 flags |= SCF_DO_STCLASS_OR;
3810 data->start_class->flags |= ANYOF_EOS;
3811 }
3812 }
3813 scan= tail;
3814 continue;
3815 }
786e8c11 3816#else
8aa23a47 3817 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3818 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3819 U8*bang=NULL;
3820
3821 min += trie->minlen;
3822 delta += (trie->maxlen - trie->minlen);
3823 flags &= ~SCF_DO_STCLASS; /* xxx */
3824 if (flags & SCF_DO_SUBSTR) {
3825 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3826 data->pos_min += trie->minlen;
3827 data->pos_delta += (trie->maxlen - trie->minlen);
3828 if (trie->maxlen != trie->minlen)
3829 data->longest = &(data->longest_float);
3830 }
3831 if (trie->jump) /* no more substrings -- for now /grr*/
3832 flags &= ~SCF_DO_SUBSTR;
b515a41d 3833 }
8aa23a47
YO
3834#endif /* old or new */
3835#endif /* TRIE_STUDY_OPT */
3836 /* Else: zero-length, ignore. */
3837 scan = regnext(scan);
3838 }
3839 if (frame) {
3840 last = frame->last;
3841 scan = frame->next;
3842 stopparen = frame->stop;
3843 frame = frame->prev;
3844 goto fake_study_recurse;
c277df42
IZ
3845 }
3846
3847 finish:
8aa23a47
YO
3848 assert(!frame);
3849
c277df42 3850 *scanp = scan;
aca2d497 3851 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3852 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3853 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3854 if (is_par > (I32)U8_MAX)
c277df42
IZ
3855 is_par = 0;
3856 if (is_par && pars==1 && data) {
3857 data->flags |= SF_IN_PAR;
3858 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3859 }
3860 else if (pars && data) {
c277df42
IZ
3861 data->flags |= SF_HAS_PAR;
3862 data->flags &= ~SF_IN_PAR;
3863 }