This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add Regexp::Keep \K functionality to regex engine as well as add \v and \V, cleanup...
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
1f1031fe 129
40d049e4
YO
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
3dab1dad 136#ifdef DEBUGGING
be8e71aa 137 const char *lastparse;
3dab1dad 138 I32 lastnum;
1f1031fe 139 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 142#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 143#endif
830247a4
IZ
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
f8fc2ecf 149#define RExC_rxi (pRExC_state->rxi)
fac92740 150#define RExC_start (pRExC_state->start)
830247a4
IZ
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 154#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 155#define RExC_emit (pRExC_state->emit)
fac92740 156#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
157#define RExC_naughty (pRExC_state->naughty)
158#define RExC_sawback (pRExC_state->sawback)
159#define RExC_seen (pRExC_state->seen)
160#define RExC_size (pRExC_state->size)
161#define RExC_npar (pRExC_state->npar)
e2e6a0f1 162#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
163#define RExC_extralen (pRExC_state->extralen)
164#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
165#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 166#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 167#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
168#define RExC_open_parens (pRExC_state->open_parens)
169#define RExC_close_parens (pRExC_state->close_parens)
170#define RExC_opend (pRExC_state->opend)
81714fb9 171#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
172#define RExC_recurse (pRExC_state->recurse)
173#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 174
a687059c
LW
175#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
176#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
177 ((*s) == '{' && regcurly(s)))
a687059c 178
35c8bce7
LW
179#ifdef SPSTART
180#undef SPSTART /* dratted cpp namespace... */
181#endif
a687059c
LW
182/*
183 * Flags to be passed up and down.
184 */
a687059c 185#define WORST 0 /* Worst case. */
821b33a5 186#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
187#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
188#define SPSTART 0x4 /* Starts with * or +. */
189#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 190
3dab1dad
YO
191#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
192
07be1b83
YO
193/* whether trie related optimizations are enabled */
194#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
195#define TRIE_STUDY_OPT
786e8c11 196#define FULL_TRIE_STUDY
07be1b83
YO
197#define TRIE_STCLASS
198#endif
1de06328
YO
199
200
40d049e4
YO
201
202#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
203#define PBITVAL(paren) (1 << ((paren) & 7))
204#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
205#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
206#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
207
208
1de06328
YO
209/* About scan_data_t.
210
211 During optimisation we recurse through the regexp program performing
212 various inplace (keyhole style) optimisations. In addition study_chunk
213 and scan_commit populate this data structure with information about
214 what strings MUST appear in the pattern. We look for the longest
215 string that must appear for at a fixed location, and we look for the
216 longest string that may appear at a floating location. So for instance
217 in the pattern:
218
219 /FOO[xX]A.*B[xX]BAR/
220
221 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
222 strings (because they follow a .* construct). study_chunk will identify
223 both FOO and BAR as being the longest fixed and floating strings respectively.
224
225 The strings can be composites, for instance
226
227 /(f)(o)(o)/
228
229 will result in a composite fixed substring 'foo'.
230
231 For each string some basic information is maintained:
232
233 - offset or min_offset
234 This is the position the string must appear at, or not before.
235 It also implicitly (when combined with minlenp) tells us how many
236 character must match before the string we are searching.
237 Likewise when combined with minlenp and the length of the string
238 tells us how many characters must appear after the string we have
239 found.
240
241 - max_offset
242 Only used for floating strings. This is the rightmost point that
243 the string can appear at. Ifset to I32 max it indicates that the
244 string can occur infinitely far to the right.
245
246 - minlenp
247 A pointer to the minimum length of the pattern that the string
248 was found inside. This is important as in the case of positive
249 lookahead or positive lookbehind we can have multiple patterns
250 involved. Consider
251
252 /(?=FOO).*F/
253
254 The minimum length of the pattern overall is 3, the minimum length
255 of the lookahead part is 3, but the minimum length of the part that
256 will actually match is 1. So 'FOO's minimum length is 3, but the
257 minimum length for the F is 1. This is important as the minimum length
258 is used to determine offsets in front of and behind the string being
259 looked for. Since strings can be composites this is the length of the
260 pattern at the time it was commited with a scan_commit. Note that
261 the length is calculated by study_chunk, so that the minimum lengths
262 are not known until the full pattern has been compiled, thus the
263 pointer to the value.
264
265 - lookbehind
266
267 In the case of lookbehind the string being searched for can be
268 offset past the start point of the final matching string.
269 If this value was just blithely removed from the min_offset it would
270 invalidate some of the calculations for how many chars must match
271 before or after (as they are derived from min_offset and minlen and
272 the length of the string being searched for).
273 When the final pattern is compiled and the data is moved from the
274 scan_data_t structure into the regexp structure the information
275 about lookbehind is factored in, with the information that would
276 have been lost precalculated in the end_shift field for the
277 associated string.
278
279 The fields pos_min and pos_delta are used to store the minimum offset
280 and the delta to the maximum offset at the current point in the pattern.
281
282*/
2c2d71f5
JH
283
284typedef struct scan_data_t {
1de06328
YO
285 /*I32 len_min; unused */
286 /*I32 len_delta; unused */
2c2d71f5
JH
287 I32 pos_min;
288 I32 pos_delta;
289 SV *last_found;
1de06328 290 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
291 I32 last_start_min;
292 I32 last_start_max;
1de06328
YO
293 SV **longest; /* Either &l_fixed, or &l_float. */
294 SV *longest_fixed; /* longest fixed string found in pattern */
295 I32 offset_fixed; /* offset where it starts */
296 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
297 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
298 SV *longest_float; /* longest floating string found in pattern */
299 I32 offset_float_min; /* earliest point in string it can appear */
300 I32 offset_float_max; /* latest point in string it can appear */
301 I32 *minlen_float; /* pointer to the minlen relevent to the string */
302 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
303 I32 flags;
304 I32 whilem_c;
cb434fcc 305 I32 *last_closep;
653099ff 306 struct regnode_charclass_class *start_class;
2c2d71f5
JH
307} scan_data_t;
308
a687059c 309/*
e50aee73 310 * Forward declarations for pregcomp()'s friends.
a687059c 311 */
a0d0e21e 312
27da23d5 313static const scan_data_t zero_scan_data =
1de06328 314 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
315
316#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
317#define SF_BEFORE_SEOL 0x0001
318#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
319#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
320#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
321
09b7f37c
CB
322#ifdef NO_UNARY_PLUS
323# define SF_FIX_SHIFT_EOL (0+2)
324# define SF_FL_SHIFT_EOL (0+4)
325#else
326# define SF_FIX_SHIFT_EOL (+2)
327# define SF_FL_SHIFT_EOL (+4)
328#endif
c277df42
IZ
329
330#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
331#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
332
333#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
334#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
335#define SF_IS_INF 0x0040
336#define SF_HAS_PAR 0x0080
337#define SF_IN_PAR 0x0100
338#define SF_HAS_EVAL 0x0200
339#define SCF_DO_SUBSTR 0x0400
653099ff
GS
340#define SCF_DO_STCLASS_AND 0x0800
341#define SCF_DO_STCLASS_OR 0x1000
342#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 343#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 344
786e8c11 345#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 346#define SCF_SEEN_ACCEPT 0x8000
07be1b83 347
eb160463 348#define UTF (RExC_utf8 != 0)
bbe252da
YO
349#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
350#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 351
ffc61ed2 352#define OOB_UNICODE 12345678
93733859 353#define OOB_NAMEDCLASS -1
b8c5462f 354
a0ed51b3
LW
355#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
356#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
357
8615cb43 358
b45f050a
JF
359/* length of regex to show in messages that don't mark a position within */
360#define RegexLengthToShowInErrorMessages 127
361
362/*
363 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
364 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
365 * op/pragma/warn/regcomp.
366 */
7253e4e3
RK
367#define MARKER1 "<-- HERE" /* marker as it appears in the description */
368#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 369
7253e4e3 370#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
371
372/*
373 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
374 * arg. Show regex, up to a maximum length. If it's too long, chop and add
375 * "...".
376 */
58e23c8d 377#define _FAIL(code) STMT_START { \
bfed75c6 378 const char *ellipses = ""; \
ccb2c380
MP
379 IV len = RExC_end - RExC_precomp; \
380 \
381 if (!SIZE_ONLY) \
382 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
383 if (len > RegexLengthToShowInErrorMessages) { \
384 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
385 len = RegexLengthToShowInErrorMessages - 10; \
386 ellipses = "..."; \
387 } \
58e23c8d 388 code; \
ccb2c380 389} STMT_END
8615cb43 390
58e23c8d
YO
391#define FAIL(msg) _FAIL( \
392 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
393 msg, (int)len, RExC_precomp, ellipses))
394
395#define FAIL2(msg,arg) _FAIL( \
396 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
397 arg, (int)len, RExC_precomp, ellipses))
398
b45f050a 399/*
b45f050a
JF
400 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
401 */
ccb2c380 402#define Simple_vFAIL(m) STMT_START { \
a28509cc 403 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
404 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
405 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
406} STMT_END
b45f050a
JF
407
408/*
409 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
410 */
ccb2c380
MP
411#define vFAIL(m) STMT_START { \
412 if (!SIZE_ONLY) \
413 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
414 Simple_vFAIL(m); \
415} STMT_END
b45f050a
JF
416
417/*
418 * Like Simple_vFAIL(), but accepts two arguments.
419 */
ccb2c380 420#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 421 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
422 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
423 (int)offset, RExC_precomp, RExC_precomp + offset); \
424} STMT_END
b45f050a
JF
425
426/*
427 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
428 */
ccb2c380
MP
429#define vFAIL2(m,a1) STMT_START { \
430 if (!SIZE_ONLY) \
431 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
432 Simple_vFAIL2(m, a1); \
433} STMT_END
b45f050a
JF
434
435
436/*
437 * Like Simple_vFAIL(), but accepts three arguments.
438 */
ccb2c380 439#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 440 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
441 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
442 (int)offset, RExC_precomp, RExC_precomp + offset); \
443} STMT_END
b45f050a
JF
444
445/*
446 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
447 */
ccb2c380
MP
448#define vFAIL3(m,a1,a2) STMT_START { \
449 if (!SIZE_ONLY) \
450 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
451 Simple_vFAIL3(m, a1, a2); \
452} STMT_END
b45f050a
JF
453
454/*
455 * Like Simple_vFAIL(), but accepts four arguments.
456 */
ccb2c380 457#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 458 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
459 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
460 (int)offset, RExC_precomp, RExC_precomp + offset); \
461} STMT_END
b45f050a 462
ccb2c380 463#define vWARN(loc,m) STMT_START { \
a28509cc 464 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
465 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
466 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
467} STMT_END
468
469#define vWARNdep(loc,m) STMT_START { \
a28509cc 470 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
471 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
472 "%s" REPORT_LOCATION, \
473 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
475
476
477#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 478 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
479 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
480 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
481} STMT_END
482
483#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 484 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
485 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
486 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
487} STMT_END
488
489#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 490 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
491 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
492 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
493} STMT_END
494
495#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 496 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
9d1d55b5 500
8615cb43 501
cd439c50 502/* Allow for side effects in s */
ccb2c380
MP
503#define REGC(c,s) STMT_START { \
504 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
505} STMT_END
cd439c50 506
fac92740
MJD
507/* Macros for recording node offsets. 20001227 mjd@plover.com
508 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
509 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
510 * Element 0 holds the number n.
07be1b83 511 * Position is 1 indexed.
fac92740
MJD
512 */
513
ccb2c380
MP
514#define Set_Node_Offset_To_R(node,byte) STMT_START { \
515 if (! SIZE_ONLY) { \
516 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 517 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 518 if((node) < 0) { \
551405c4 519 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
520 } else { \
521 RExC_offsets[2*(node)-1] = (byte); \
522 } \
523 } \
524} STMT_END
525
526#define Set_Node_Offset(node,byte) \
527 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
528#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
529
530#define Set_Node_Length_To_R(node,len) STMT_START { \
531 if (! SIZE_ONLY) { \
532 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 533 __LINE__, (int)(node), (int)(len))); \
ccb2c380 534 if((node) < 0) { \
551405c4 535 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
536 } else { \
537 RExC_offsets[2*(node)] = (len); \
538 } \
539 } \
540} STMT_END
541
542#define Set_Node_Length(node,len) \
543 Set_Node_Length_To_R((node)-RExC_emit_start, len)
544#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
545#define Set_Node_Cur_Length(node) \
546 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
547
548/* Get offsets and lengths */
549#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
550#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
551
07be1b83
YO
552#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
553 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
554 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
555} STMT_END
556
557
558#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
559#define EXPERIMENTAL_INPLACESCAN
560#endif
561
304ee84b
YO
562#define DEBUG_STUDYDATA(str,data,depth) \
563DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 564 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
565 "%*s" str "Pos:%"IVdf"/%"IVdf \
566 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
567 (int)(depth)*2, "", \
568 (IV)((data)->pos_min), \
569 (IV)((data)->pos_delta), \
304ee84b 570 (UV)((data)->flags), \
1de06328 571 (IV)((data)->whilem_c), \
304ee84b
YO
572 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
573 is_inf ? "INF " : "" \
1de06328
YO
574 ); \
575 if ((data)->last_found) \
576 PerlIO_printf(Perl_debug_log, \
577 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
578 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
579 SvPVX_const((data)->last_found), \
580 (IV)((data)->last_end), \
581 (IV)((data)->last_start_min), \
582 (IV)((data)->last_start_max), \
583 ((data)->longest && \
584 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
585 SvPVX_const((data)->longest_fixed), \
586 (IV)((data)->offset_fixed), \
587 ((data)->longest && \
588 (data)->longest==&((data)->longest_float)) ? "*" : "", \
589 SvPVX_const((data)->longest_float), \
590 (IV)((data)->offset_float_min), \
591 (IV)((data)->offset_float_max) \
592 ); \
593 PerlIO_printf(Perl_debug_log,"\n"); \
594});
595
acfe0abc 596static void clear_re(pTHX_ void *r);
4327152a 597
653099ff 598/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 599 Update the longest found anchored substring and the longest found
653099ff
GS
600 floating substrings if needed. */
601
4327152a 602STATIC void
304ee84b 603S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 604{
e1ec3a88
AL
605 const STRLEN l = CHR_SVLEN(data->last_found);
606 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 607 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 608
c277df42 609 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 610 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
611 if (*data->longest == data->longest_fixed) {
612 data->offset_fixed = l ? data->last_start_min : data->pos_min;
613 if (data->flags & SF_BEFORE_EOL)
b81d288d 614 data->flags
c277df42
IZ
615 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
616 else
617 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
618 data->minlen_fixed=minlenp;
619 data->lookbehind_fixed=0;
a0ed51b3 620 }
304ee84b 621 else { /* *data->longest == data->longest_float */
c277df42 622 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
623 data->offset_float_max = (l
624 ? data->last_start_max
c277df42 625 : data->pos_min + data->pos_delta);
304ee84b 626 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 627 data->offset_float_max = I32_MAX;
c277df42 628 if (data->flags & SF_BEFORE_EOL)
b81d288d 629 data->flags
c277df42
IZ
630 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
631 else
632 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
633 data->minlen_float=minlenp;
634 data->lookbehind_float=0;
c277df42
IZ
635 }
636 }
637 SvCUR_set(data->last_found, 0);
0eda9292 638 {
a28509cc 639 SV * const sv = data->last_found;
097eb12c
AL
640 if (SvUTF8(sv) && SvMAGICAL(sv)) {
641 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
642 if (mg)
643 mg->mg_len = 0;
644 }
0eda9292 645 }
c277df42
IZ
646 data->last_end = -1;
647 data->flags &= ~SF_BEFORE_EOL;
304ee84b 648 DEBUG_STUDYDATA("cl_anything: ",data,0);
c277df42
IZ
649}
650
653099ff
GS
651/* Can match anything (initialization) */
652STATIC void
097eb12c 653S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 654{
653099ff 655 ANYOF_CLASS_ZERO(cl);
f8bef550 656 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 657 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
658 if (LOC)
659 cl->flags |= ANYOF_LOCALE;
660}
661
662/* Can match anything (initialization) */
663STATIC int
5f66b61c 664S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
665{
666 int value;
667
aaa51d5e 668 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
669 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
670 return 1;
1aa99e6b
IH
671 if (!(cl->flags & ANYOF_UNICODE_ALL))
672 return 0;
10edeb5d 673 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 674 return 0;
653099ff
GS
675 return 1;
676}
677
678/* Can match anything (initialization) */
679STATIC void
097eb12c 680S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 681{
8ecf7187 682 Zero(cl, 1, struct regnode_charclass_class);
653099ff 683 cl->type = ANYOF;
830247a4 684 cl_anything(pRExC_state, cl);
653099ff
GS
685}
686
687STATIC void
097eb12c 688S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 689{
8ecf7187 690 Zero(cl, 1, struct regnode_charclass_class);
653099ff 691 cl->type = ANYOF;
830247a4 692 cl_anything(pRExC_state, cl);
653099ff
GS
693 if (LOC)
694 cl->flags |= ANYOF_LOCALE;
695}
696
697/* 'And' a given class with another one. Can create false positives */
698/* We assume that cl is not inverted */
699STATIC void
5f66b61c 700S_cl_and(struct regnode_charclass_class *cl,
a28509cc 701 const struct regnode_charclass_class *and_with)
653099ff 702{
40d049e4
YO
703
704 assert(and_with->type == ANYOF);
653099ff
GS
705 if (!(and_with->flags & ANYOF_CLASS)
706 && !(cl->flags & ANYOF_CLASS)
707 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
708 && !(and_with->flags & ANYOF_FOLD)
709 && !(cl->flags & ANYOF_FOLD)) {
710 int i;
711
712 if (and_with->flags & ANYOF_INVERT)
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= ~and_with->bitmap[i];
715 else
716 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
717 cl->bitmap[i] &= and_with->bitmap[i];
718 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
719 if (!(and_with->flags & ANYOF_EOS))
720 cl->flags &= ~ANYOF_EOS;
1aa99e6b 721
14ebb1a2
JH
722 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
723 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
724 cl->flags &= ~ANYOF_UNICODE_ALL;
725 cl->flags |= ANYOF_UNICODE;
726 ARG_SET(cl, ARG(and_with));
727 }
14ebb1a2
JH
728 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
731 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
732 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 733 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
734}
735
736/* 'OR' a given class with another one. Can create false positives */
737/* We assume that cl is not inverted */
738STATIC void
097eb12c 739S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 740{
653099ff
GS
741 if (or_with->flags & ANYOF_INVERT) {
742 /* We do not use
743 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
744 * <= (B1 | !B2) | (CL1 | !CL2)
745 * which is wasteful if CL2 is small, but we ignore CL2:
746 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
747 * XXXX Can we handle case-fold? Unclear:
748 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
749 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
750 */
751 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
752 && !(or_with->flags & ANYOF_FOLD)
753 && !(cl->flags & ANYOF_FOLD) ) {
754 int i;
755
756 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
757 cl->bitmap[i] |= ~or_with->bitmap[i];
758 } /* XXXX: logic is complicated otherwise */
759 else {
830247a4 760 cl_anything(pRExC_state, cl);
653099ff
GS
761 }
762 } else {
763 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
764 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 765 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
766 || (cl->flags & ANYOF_FOLD)) ) {
767 int i;
768
769 /* OR char bitmap and class bitmap separately */
770 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
771 cl->bitmap[i] |= or_with->bitmap[i];
772 if (or_with->flags & ANYOF_CLASS) {
773 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
774 cl->classflags[i] |= or_with->classflags[i];
775 cl->flags |= ANYOF_CLASS;
776 }
777 }
778 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 779 cl_anything(pRExC_state, cl);
653099ff
GS
780 }
781 }
782 if (or_with->flags & ANYOF_EOS)
783 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
784
785 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
786 ARG(cl) != ARG(or_with)) {
787 cl->flags |= ANYOF_UNICODE_ALL;
788 cl->flags &= ~ANYOF_UNICODE;
789 }
790 if (or_with->flags & ANYOF_UNICODE_ALL) {
791 cl->flags |= ANYOF_UNICODE_ALL;
792 cl->flags &= ~ANYOF_UNICODE;
793 }
653099ff
GS
794}
795
a3621e74
YO
796#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
797#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
798#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
799#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
800
3dab1dad
YO
801
802#ifdef DEBUGGING
07be1b83 803/*
2b8b4781
NC
804 dump_trie(trie,widecharmap,revcharmap)
805 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
806 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
807
808 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
809 The _interim_ variants are used for debugging the interim
810 tables that are used to generate the final compressed
811 representation which is what dump_trie expects.
812
3dab1dad
YO
813 Part of the reason for their existance is to provide a form
814 of documentation as to how the different representations function.
07be1b83
YO
815
816*/
3dab1dad
YO
817
818/*
3dab1dad
YO
819 Dumps the final compressed table form of the trie to Perl_debug_log.
820 Used for debugging make_trie().
821*/
822
823STATIC void
2b8b4781
NC
824S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
825 AV *revcharmap, U32 depth)
3dab1dad
YO
826{
827 U32 state;
ab3bbdeb 828 SV *sv=sv_newmortal();
55eed653 829 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
830 GET_RE_DEBUG_FLAGS_DECL;
831
ab3bbdeb 832
3dab1dad
YO
833 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
834 (int)depth * 2 + 2,"",
835 "Match","Base","Ofs" );
836
837 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 838 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 839 if ( tmp ) {
ab3bbdeb
YO
840 PerlIO_printf( Perl_debug_log, "%*s",
841 colwidth,
ddc5bc0f 842 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
843 PL_colors[0], PL_colors[1],
844 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
845 PERL_PV_ESCAPE_FIRSTCHAR
846 )
847 );
3dab1dad
YO
848 }
849 }
850 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
851 (int)depth * 2 + 2,"");
852
853 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 854 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
855 PerlIO_printf( Perl_debug_log, "\n");
856
1e2e3d02 857 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 858 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
859
860 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
861
862 if ( trie->states[ state ].wordnum ) {
863 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
864 } else {
865 PerlIO_printf( Perl_debug_log, "%6s", "" );
866 }
867
868 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
869
870 if ( base ) {
871 U32 ofs = 0;
872
873 while( ( base + ofs < trie->uniquecharcount ) ||
874 ( base + ofs - trie->uniquecharcount < trie->lasttrans
875 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
876 ofs++;
877
878 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
879
880 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
881 if ( ( base + ofs >= trie->uniquecharcount ) &&
882 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
883 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
884 {
ab3bbdeb
YO
885 PerlIO_printf( Perl_debug_log, "%*"UVXf,
886 colwidth,
3dab1dad
YO
887 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
888 } else {
ab3bbdeb 889 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
890 }
891 }
892
893 PerlIO_printf( Perl_debug_log, "]");
894
895 }
896 PerlIO_printf( Perl_debug_log, "\n" );
897 }
898}
899/*
3dab1dad
YO
900 Dumps a fully constructed but uncompressed trie in list form.
901 List tries normally only are used for construction when the number of
902 possible chars (trie->uniquecharcount) is very high.
903 Used for debugging make_trie().
904*/
905STATIC void
55eed653 906S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
907 HV *widecharmap, AV *revcharmap, U32 next_alloc,
908 U32 depth)
3dab1dad
YO
909{
910 U32 state;
ab3bbdeb 911 SV *sv=sv_newmortal();
55eed653 912 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
913 GET_RE_DEBUG_FLAGS_DECL;
914 /* print out the table precompression. */
ab3bbdeb
YO
915 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
916 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
917 "------:-----+-----------------\n" );
3dab1dad
YO
918
919 for( state=1 ; state < next_alloc ; state ++ ) {
920 U16 charid;
921
ab3bbdeb 922 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
923 (int)depth * 2 + 2,"", (UV)state );
924 if ( ! trie->states[ state ].wordnum ) {
925 PerlIO_printf( Perl_debug_log, "%5s| ","");
926 } else {
927 PerlIO_printf( Perl_debug_log, "W%4x| ",
928 trie->states[ state ].wordnum
929 );
930 }
931 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 932 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
933 if ( tmp ) {
934 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
935 colwidth,
ddc5bc0f 936 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
937 PL_colors[0], PL_colors[1],
938 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
939 PERL_PV_ESCAPE_FIRSTCHAR
940 ) ,
1e2e3d02
YO
941 TRIE_LIST_ITEM(state,charid).forid,
942 (UV)TRIE_LIST_ITEM(state,charid).newstate
943 );
944 if (!(charid % 10))
664e119d
RGS
945 PerlIO_printf(Perl_debug_log, "\n%*s| ",
946 (int)((depth * 2) + 14), "");
1e2e3d02 947 }
ab3bbdeb
YO
948 }
949 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
950 }
951}
952
953/*
3dab1dad
YO
954 Dumps a fully constructed but uncompressed trie in table form.
955 This is the normal DFA style state transition table, with a few
956 twists to facilitate compression later.
957 Used for debugging make_trie().
958*/
959STATIC void
55eed653 960S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
961 HV *widecharmap, AV *revcharmap, U32 next_alloc,
962 U32 depth)
3dab1dad
YO
963{
964 U32 state;
965 U16 charid;
ab3bbdeb 966 SV *sv=sv_newmortal();
55eed653 967 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
968 GET_RE_DEBUG_FLAGS_DECL;
969
970 /*
971 print out the table precompression so that we can do a visual check
972 that they are identical.
973 */
974
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
976
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 978 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 979 if ( tmp ) {
ab3bbdeb
YO
980 PerlIO_printf( Perl_debug_log, "%*s",
981 colwidth,
ddc5bc0f 982 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
983 PL_colors[0], PL_colors[1],
984 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
985 PERL_PV_ESCAPE_FIRSTCHAR
986 )
987 );
3dab1dad
YO
988 }
989 }
990
991 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
992
993 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 994 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
995 }
996
997 PerlIO_printf( Perl_debug_log, "\n" );
998
999 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1000
1001 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1002 (int)depth * 2 + 2,"",
1003 (UV)TRIE_NODENUM( state ) );
1004
1005 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1006 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1007 if (v)
1008 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1009 else
1010 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1011 }
1012 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1013 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1014 } else {
1015 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1016 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1017 }
1018 }
07be1b83 1019}
3dab1dad
YO
1020
1021#endif
1022
786e8c11
YO
1023/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1024 startbranch: the first branch in the whole branch sequence
1025 first : start branch of sequence of branch-exact nodes.
1026 May be the same as startbranch
1027 last : Thing following the last branch.
1028 May be the same as tail.
1029 tail : item following the branch sequence
1030 count : words in the sequence
1031 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1032 depth : indent depth
3dab1dad 1033
786e8c11 1034Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1035
786e8c11
YO
1036A trie is an N'ary tree where the branches are determined by digital
1037decomposition of the key. IE, at the root node you look up the 1st character and
1038follow that branch repeat until you find the end of the branches. Nodes can be
1039marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1040
786e8c11 1041 /he|she|his|hers/
72f13be8 1042
786e8c11
YO
1043would convert into the following structure. Numbers represent states, letters
1044following numbers represent valid transitions on the letter from that state, if
1045the number is in square brackets it represents an accepting state, otherwise it
1046will be in parenthesis.
07be1b83 1047
786e8c11
YO
1048 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1049 | |
1050 | (2)
1051 | |
1052 (1) +-i->(6)-+-s->[7]
1053 |
1054 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1055
786e8c11
YO
1056 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1057
1058This shows that when matching against the string 'hers' we will begin at state 1
1059read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1060then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1061is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1062single traverse. We store a mapping from accepting to state to which word was
1063matched, and then when we have multiple possibilities we try to complete the
1064rest of the regex in the order in which they occured in the alternation.
1065
1066The only prior NFA like behaviour that would be changed by the TRIE support is
1067the silent ignoring of duplicate alternations which are of the form:
1068
1069 / (DUPE|DUPE) X? (?{ ... }) Y /x
1070
1071Thus EVAL blocks follwing a trie may be called a different number of times with
1072and without the optimisation. With the optimisations dupes will be silently
1073ignored. This inconsistant behaviour of EVAL type nodes is well established as
1074the following demonstrates:
1075
1076 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1077
1078which prints out 'word' three times, but
1079
1080 'words'=~/(word|word|word)(?{ print $1 })S/
1081
1082which doesnt print it out at all. This is due to other optimisations kicking in.
1083
1084Example of what happens on a structural level:
1085
1086The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1087
1088 1: CURLYM[1] {1,32767}(18)
1089 5: BRANCH(8)
1090 6: EXACT <ac>(16)
1091 8: BRANCH(11)
1092 9: EXACT <ad>(16)
1093 11: BRANCH(14)
1094 12: EXACT <ab>(16)
1095 16: SUCCEED(0)
1096 17: NOTHING(18)
1097 18: END(0)
1098
1099This would be optimizable with startbranch=5, first=5, last=16, tail=16
1100and should turn into:
1101
1102 1: CURLYM[1] {1,32767}(18)
1103 5: TRIE(16)
1104 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1105 <ac>
1106 <ad>
1107 <ab>
1108 16: SUCCEED(0)
1109 17: NOTHING(18)
1110 18: END(0)
1111
1112Cases where tail != last would be like /(?foo|bar)baz/:
1113
1114 1: BRANCH(4)
1115 2: EXACT <foo>(8)
1116 4: BRANCH(7)
1117 5: EXACT <bar>(8)
1118 7: TAIL(8)
1119 8: EXACT <baz>(10)
1120 10: END(0)
1121
1122which would be optimizable with startbranch=1, first=1, last=7, tail=8
1123and would end up looking like:
1124
1125 1: TRIE(8)
1126 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1127 <foo>
1128 <bar>
1129 7: TAIL(8)
1130 8: EXACT <baz>(10)
1131 10: END(0)
1132
1133 d = uvuni_to_utf8_flags(d, uv, 0);
1134
1135is the recommended Unicode-aware way of saying
1136
1137 *(d++) = uv;
1138*/
1139
1e2e3d02 1140#define TRIE_STORE_REVCHAR \
786e8c11 1141 STMT_START { \
1e2e3d02 1142 SV *tmp = newSVpvs(""); \
786e8c11 1143 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1144 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1145 av_push( revcharmap, tmp ); \
786e8c11
YO
1146 } STMT_END
1147
1148#define TRIE_READ_CHAR STMT_START { \
1149 wordlen++; \
1150 if ( UTF ) { \
1151 if ( folder ) { \
1152 if ( foldlen > 0 ) { \
1153 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1154 foldlen -= len; \
1155 scan += len; \
1156 len = 0; \
1157 } else { \
1158 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1159 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1160 foldlen -= UNISKIP( uvc ); \
1161 scan = foldbuf + UNISKIP( uvc ); \
1162 } \
1163 } else { \
1164 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1165 } \
1166 } else { \
1167 uvc = (U32)*uc; \
1168 len = 1; \
1169 } \
1170} STMT_END
1171
1172
1173
1174#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1175 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1176 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1177 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1178 } \
1179 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1180 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1181 TRIE_LIST_CUR( state )++; \
1182} STMT_END
07be1b83 1183
786e8c11
YO
1184#define TRIE_LIST_NEW(state) STMT_START { \
1185 Newxz( trie->states[ state ].trans.list, \
1186 4, reg_trie_trans_le ); \
1187 TRIE_LIST_CUR( state ) = 1; \
1188 TRIE_LIST_LEN( state ) = 4; \
1189} STMT_END
07be1b83 1190
786e8c11
YO
1191#define TRIE_HANDLE_WORD(state) STMT_START { \
1192 U16 dupe= trie->states[ state ].wordnum; \
1193 regnode * const noper_next = regnext( noper ); \
1194 \
1195 if (trie->wordlen) \
1196 trie->wordlen[ curword ] = wordlen; \
1197 DEBUG_r({ \
1198 /* store the word for dumping */ \
1199 SV* tmp; \
1200 if (OP(noper) != NOTHING) \
1201 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1202 else \
1203 tmp = newSVpvn( "", 0 ); \
1204 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1205 av_push( trie_words, tmp ); \
786e8c11
YO
1206 }); \
1207 \
1208 curword++; \
1209 \
1210 if ( noper_next < tail ) { \
1211 if (!trie->jump) \
c944940b 1212 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1213 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1214 if (!jumper) \
1215 jumper = noper_next; \
1216 if (!nextbranch) \
1217 nextbranch= regnext(cur); \
1218 } \
1219 \
1220 if ( dupe ) { \
1221 /* So it's a dupe. This means we need to maintain a */\
1222 /* linked-list from the first to the next. */\
1223 /* we only allocate the nextword buffer when there */\
1224 /* a dupe, so first time we have to do the allocation */\
1225 if (!trie->nextword) \
c944940b 1226 trie->nextword = (U16 *) \
446bd890 1227 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1228 while ( trie->nextword[dupe] ) \
1229 dupe= trie->nextword[dupe]; \
1230 trie->nextword[dupe]= curword; \
1231 } else { \
1232 /* we haven't inserted this word yet. */ \
1233 trie->states[ state ].wordnum = curword; \
1234 } \
1235} STMT_END
07be1b83 1236
3dab1dad 1237
786e8c11
YO
1238#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1239 ( ( base + charid >= ucharcount \
1240 && base + charid < ubound \
1241 && state == trie->trans[ base - ucharcount + charid ].check \
1242 && trie->trans[ base - ucharcount + charid ].next ) \
1243 ? trie->trans[ base - ucharcount + charid ].next \
1244 : ( state==1 ? special : 0 ) \
1245 )
3dab1dad 1246
786e8c11
YO
1247#define MADE_TRIE 1
1248#define MADE_JUMP_TRIE 2
1249#define MADE_EXACT_TRIE 4
3dab1dad 1250
a3621e74 1251STATIC I32
786e8c11 1252S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1253{
27da23d5 1254 dVAR;
a3621e74
YO
1255 /* first pass, loop through and scan words */
1256 reg_trie_data *trie;
55eed653 1257 HV *widecharmap = NULL;
2b8b4781 1258 AV *revcharmap = newAV();
a3621e74 1259 regnode *cur;
9f7f3913 1260 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1261 STRLEN len = 0;
1262 UV uvc = 0;
1263 U16 curword = 0;
1264 U32 next_alloc = 0;
786e8c11
YO
1265 regnode *jumper = NULL;
1266 regnode *nextbranch = NULL;
7f69552c 1267 regnode *convert = NULL;
a3621e74 1268 /* we just use folder as a flag in utf8 */
e1ec3a88 1269 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1270 ? PL_fold
1271 : ( flags == EXACTFL
1272 ? PL_fold_locale
1273 : NULL
1274 )
1275 );
1276
2b8b4781
NC
1277#ifdef DEBUGGING
1278 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1279 AV *trie_words = NULL;
1280 /* along with revcharmap, this only used during construction but both are
1281 * useful during debugging so we store them in the struct when debugging.
8e11feef 1282 */
2b8b4781
NC
1283#else
1284 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1285 STRLEN trie_charcount=0;
3dab1dad 1286#endif
2b8b4781 1287 SV *re_trie_maxbuff;
a3621e74 1288 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1289#ifndef DEBUGGING
1290 PERL_UNUSED_ARG(depth);
1291#endif
a3621e74 1292
c944940b 1293 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1294 trie->refcount = 1;
3dab1dad 1295 trie->startstate = 1;
786e8c11 1296 trie->wordcount = word_count;
f8fc2ecf 1297 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1298 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1299 if (!(UTF && folder))
c944940b 1300 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1301 DEBUG_r({
2b8b4781 1302 trie_words = newAV();
a3621e74 1303 });
a3621e74 1304
0111c4fd 1305 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1306 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1307 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1308 }
3dab1dad
YO
1309 DEBUG_OPTIMISE_r({
1310 PerlIO_printf( Perl_debug_log,
786e8c11 1311 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1312 (int)depth * 2 + 2, "",
1313 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1314 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1315 (int)depth);
3dab1dad 1316 });
7f69552c
YO
1317
1318 /* Find the node we are going to overwrite */
1319 if ( first == startbranch && OP( last ) != BRANCH ) {
1320 /* whole branch chain */
1321 convert = first;
1322 } else {
1323 /* branch sub-chain */
1324 convert = NEXTOPER( first );
1325 }
1326
a3621e74
YO
1327 /* -- First loop and Setup --
1328
1329 We first traverse the branches and scan each word to determine if it
1330 contains widechars, and how many unique chars there are, this is
1331 important as we have to build a table with at least as many columns as we
1332 have unique chars.
1333
1334 We use an array of integers to represent the character codes 0..255
1335 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1336 native representation of the character value as the key and IV's for the
1337 coded index.
1338
1339 *TODO* If we keep track of how many times each character is used we can
1340 remap the columns so that the table compression later on is more
1341 efficient in terms of memory by ensuring most common value is in the
1342 middle and the least common are on the outside. IMO this would be better
1343 than a most to least common mapping as theres a decent chance the most
1344 common letter will share a node with the least common, meaning the node
1345 will not be compressable. With a middle is most common approach the worst
1346 case is when we have the least common nodes twice.
1347
1348 */
1349
a3621e74 1350 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1351 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1352 const U8 *uc = (U8*)STRING( noper );
a28509cc 1353 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1354 STRLEN foldlen = 0;
1355 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1356 const U8 *scan = (U8*)NULL;
07be1b83 1357 U32 wordlen = 0; /* required init */
3dab1dad 1358 STRLEN chars=0;
a3621e74 1359
3dab1dad
YO
1360 if (OP(noper) == NOTHING) {
1361 trie->minlen= 0;
1362 continue;
1363 }
1364 if (trie->bitmap) {
1365 TRIE_BITMAP_SET(trie,*uc);
1366 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1367 }
a3621e74 1368 for ( ; uc < e ; uc += len ) {
3dab1dad 1369 TRIE_CHARCOUNT(trie)++;
a3621e74 1370 TRIE_READ_CHAR;
3dab1dad 1371 chars++;
a3621e74
YO
1372 if ( uvc < 256 ) {
1373 if ( !trie->charmap[ uvc ] ) {
1374 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1375 if ( folder )
1376 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1377 TRIE_STORE_REVCHAR;
a3621e74
YO
1378 }
1379 } else {
1380 SV** svpp;
55eed653
NC
1381 if ( !widecharmap )
1382 widecharmap = newHV();
a3621e74 1383
55eed653 1384 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1385
1386 if ( !svpp )
e4584336 1387 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1388
1389 if ( !SvTRUE( *svpp ) ) {
1390 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1391 TRIE_STORE_REVCHAR;
a3621e74
YO
1392 }
1393 }
1394 }
3dab1dad
YO
1395 if( cur == first ) {
1396 trie->minlen=chars;
1397 trie->maxlen=chars;
1398 } else if (chars < trie->minlen) {
1399 trie->minlen=chars;
1400 } else if (chars > trie->maxlen) {
1401 trie->maxlen=chars;
1402 }
1403
a3621e74
YO
1404 } /* end first pass */
1405 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1406 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1407 (int)depth * 2 + 2,"",
55eed653 1408 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1409 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1410 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1411 );
c944940b 1412 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1413
1414 /*
1415 We now know what we are dealing with in terms of unique chars and
1416 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1417 representation using a flat table will take. If it's over a reasonable
1418 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1419 conservative but potentially much slower representation using an array
1420 of lists.
1421
1422 At the end we convert both representations into the same compressed
1423 form that will be used in regexec.c for matching with. The latter
1424 is a form that cannot be used to construct with but has memory
1425 properties similar to the list form and access properties similar
1426 to the table form making it both suitable for fast searches and
1427 small enough that its feasable to store for the duration of a program.
1428
1429 See the comment in the code where the compressed table is produced
1430 inplace from the flat tabe representation for an explanation of how
1431 the compression works.
1432
1433 */
1434
1435
3dab1dad 1436 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1437 /*
1438 Second Pass -- Array Of Lists Representation
1439
1440 Each state will be represented by a list of charid:state records
1441 (reg_trie_trans_le) the first such element holds the CUR and LEN
1442 points of the allocated array. (See defines above).
1443
1444 We build the initial structure using the lists, and then convert
1445 it into the compressed table form which allows faster lookups
1446 (but cant be modified once converted).
a3621e74
YO
1447 */
1448
a3621e74
YO
1449 STRLEN transcount = 1;
1450
1e2e3d02
YO
1451 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1452 "%*sCompiling trie using list compiler\n",
1453 (int)depth * 2 + 2, ""));
446bd890 1454
c944940b
JH
1455 trie->states = (reg_trie_state *)
1456 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1457 sizeof(reg_trie_state) );
a3621e74
YO
1458 TRIE_LIST_NEW(1);
1459 next_alloc = 2;
1460
1461 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1462
c445ea15
AL
1463 regnode * const noper = NEXTOPER( cur );
1464 U8 *uc = (U8*)STRING( noper );
1465 const U8 * const e = uc + STR_LEN( noper );
1466 U32 state = 1; /* required init */
1467 U16 charid = 0; /* sanity init */
1468 U8 *scan = (U8*)NULL; /* sanity init */
1469 STRLEN foldlen = 0; /* required init */
07be1b83 1470 U32 wordlen = 0; /* required init */
c445ea15
AL
1471 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1472
3dab1dad 1473 if (OP(noper) != NOTHING) {
786e8c11 1474 for ( ; uc < e ; uc += len ) {
c445ea15 1475
786e8c11 1476 TRIE_READ_CHAR;
c445ea15 1477
786e8c11
YO
1478 if ( uvc < 256 ) {
1479 charid = trie->charmap[ uvc ];
c445ea15 1480 } else {
55eed653 1481 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1482 if ( !svpp ) {
1483 charid = 0;
1484 } else {
1485 charid=(U16)SvIV( *svpp );
1486 }
c445ea15 1487 }
786e8c11
YO
1488 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1489 if ( charid ) {
a3621e74 1490
786e8c11
YO
1491 U16 check;
1492 U32 newstate = 0;
a3621e74 1493
786e8c11
YO
1494 charid--;
1495 if ( !trie->states[ state ].trans.list ) {
1496 TRIE_LIST_NEW( state );
c445ea15 1497 }
786e8c11
YO
1498 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1499 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1500 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1501 break;
1502 }
1503 }
1504 if ( ! newstate ) {
1505 newstate = next_alloc++;
1506 TRIE_LIST_PUSH( state, charid, newstate );
1507 transcount++;
1508 }
1509 state = newstate;
1510 } else {
1511 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1512 }
a28509cc 1513 }
c445ea15 1514 }
3dab1dad 1515 TRIE_HANDLE_WORD(state);
a3621e74
YO
1516
1517 } /* end second pass */
1518
1e2e3d02
YO
1519 /* next alloc is the NEXT state to be allocated */
1520 trie->statecount = next_alloc;
c944940b
JH
1521 trie->states = (reg_trie_state *)
1522 PerlMemShared_realloc( trie->states,
1523 next_alloc
1524 * sizeof(reg_trie_state) );
a3621e74 1525
3dab1dad 1526 /* and now dump it out before we compress it */
2b8b4781
NC
1527 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1528 revcharmap, next_alloc,
1529 depth+1)
1e2e3d02 1530 );
a3621e74 1531
c944940b
JH
1532 trie->trans = (reg_trie_trans *)
1533 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1534 {
1535 U32 state;
a3621e74
YO
1536 U32 tp = 0;
1537 U32 zp = 0;
1538
1539
1540 for( state=1 ; state < next_alloc ; state ++ ) {
1541 U32 base=0;
1542
1543 /*
1544 DEBUG_TRIE_COMPILE_MORE_r(
1545 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1546 );
1547 */
1548
1549 if (trie->states[state].trans.list) {
1550 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1551 U16 maxid=minid;
a28509cc 1552 U16 idx;
a3621e74
YO
1553
1554 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1555 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1556 if ( forid < minid ) {
1557 minid=forid;
1558 } else if ( forid > maxid ) {
1559 maxid=forid;
1560 }
a3621e74
YO
1561 }
1562 if ( transcount < tp + maxid - minid + 1) {
1563 transcount *= 2;
c944940b
JH
1564 trie->trans = (reg_trie_trans *)
1565 PerlMemShared_realloc( trie->trans,
446bd890
NC
1566 transcount
1567 * sizeof(reg_trie_trans) );
a3621e74
YO
1568 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1569 }
1570 base = trie->uniquecharcount + tp - minid;
1571 if ( maxid == minid ) {
1572 U32 set = 0;
1573 for ( ; zp < tp ; zp++ ) {
1574 if ( ! trie->trans[ zp ].next ) {
1575 base = trie->uniquecharcount + zp - minid;
1576 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1577 trie->trans[ zp ].check = state;
1578 set = 1;
1579 break;
1580 }
1581 }
1582 if ( !set ) {
1583 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1584 trie->trans[ tp ].check = state;
1585 tp++;
1586 zp = tp;
1587 }
1588 } else {
1589 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1590 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1591 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1592 trie->trans[ tid ].check = state;
1593 }
1594 tp += ( maxid - minid + 1 );
1595 }
1596 Safefree(trie->states[ state ].trans.list);
1597 }
1598 /*
1599 DEBUG_TRIE_COMPILE_MORE_r(
1600 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1601 );
1602 */
1603 trie->states[ state ].trans.base=base;
1604 }
cc601c31 1605 trie->lasttrans = tp + 1;
a3621e74
YO
1606 }
1607 } else {
1608 /*
1609 Second Pass -- Flat Table Representation.
1610
1611 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1612 We know that we will need Charcount+1 trans at most to store the data
1613 (one row per char at worst case) So we preallocate both structures
1614 assuming worst case.
1615
1616 We then construct the trie using only the .next slots of the entry
1617 structs.
1618
1619 We use the .check field of the first entry of the node temporarily to
1620 make compression both faster and easier by keeping track of how many non
1621 zero fields are in the node.
1622
1623 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1624 transition.
1625
1626 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1627 number representing the first entry of the node, and state as a
1628 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1629 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1630 are 2 entrys per node. eg:
1631
1632 A B A B
1633 1. 2 4 1. 3 7
1634 2. 0 3 3. 0 5
1635 3. 0 0 5. 0 0
1636 4. 0 0 7. 0 0
1637
1638 The table is internally in the right hand, idx form. However as we also
1639 have to deal with the states array which is indexed by nodenum we have to
1640 use TRIE_NODENUM() to convert.
1641
1642 */
1e2e3d02
YO
1643 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1644 "%*sCompiling trie using table compiler\n",
1645 (int)depth * 2 + 2, ""));
3dab1dad 1646
c944940b
JH
1647 trie->trans = (reg_trie_trans *)
1648 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1649 * trie->uniquecharcount + 1,
1650 sizeof(reg_trie_trans) );
1651 trie->states = (reg_trie_state *)
1652 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1653 sizeof(reg_trie_state) );
a3621e74
YO
1654 next_alloc = trie->uniquecharcount + 1;
1655
3dab1dad 1656
a3621e74
YO
1657 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1658
c445ea15 1659 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1660 const U8 *uc = (U8*)STRING( noper );
1661 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1662
1663 U32 state = 1; /* required init */
1664
1665 U16 charid = 0; /* sanity init */
1666 U32 accept_state = 0; /* sanity init */
1667 U8 *scan = (U8*)NULL; /* sanity init */
1668
1669 STRLEN foldlen = 0; /* required init */
07be1b83 1670 U32 wordlen = 0; /* required init */
a3621e74
YO
1671 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1672
3dab1dad 1673 if ( OP(noper) != NOTHING ) {
786e8c11 1674 for ( ; uc < e ; uc += len ) {
a3621e74 1675
786e8c11 1676 TRIE_READ_CHAR;
a3621e74 1677
786e8c11
YO
1678 if ( uvc < 256 ) {
1679 charid = trie->charmap[ uvc ];
1680 } else {
55eed653 1681 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1682 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1683 }
786e8c11
YO
1684 if ( charid ) {
1685 charid--;
1686 if ( !trie->trans[ state + charid ].next ) {
1687 trie->trans[ state + charid ].next = next_alloc;
1688 trie->trans[ state ].check++;
1689 next_alloc += trie->uniquecharcount;
1690 }
1691 state = trie->trans[ state + charid ].next;
1692 } else {
1693 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1694 }
1695 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1696 }
a3621e74 1697 }
3dab1dad
YO
1698 accept_state = TRIE_NODENUM( state );
1699 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1700
1701 } /* end second pass */
1702
3dab1dad 1703 /* and now dump it out before we compress it */
2b8b4781
NC
1704 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1705 revcharmap,
1706 next_alloc, depth+1));
a3621e74 1707
a3621e74
YO
1708 {
1709 /*
1710 * Inplace compress the table.*
1711
1712 For sparse data sets the table constructed by the trie algorithm will
1713 be mostly 0/FAIL transitions or to put it another way mostly empty.
1714 (Note that leaf nodes will not contain any transitions.)
1715
1716 This algorithm compresses the tables by eliminating most such
1717 transitions, at the cost of a modest bit of extra work during lookup:
1718
1719 - Each states[] entry contains a .base field which indicates the
1720 index in the state[] array wheres its transition data is stored.
1721
1722 - If .base is 0 there are no valid transitions from that node.
1723
1724 - If .base is nonzero then charid is added to it to find an entry in
1725 the trans array.
1726
1727 -If trans[states[state].base+charid].check!=state then the
1728 transition is taken to be a 0/Fail transition. Thus if there are fail
1729 transitions at the front of the node then the .base offset will point
1730 somewhere inside the previous nodes data (or maybe even into a node
1731 even earlier), but the .check field determines if the transition is
1732 valid.
1733
786e8c11 1734 XXX - wrong maybe?
a3621e74
YO
1735 The following process inplace converts the table to the compressed
1736 table: We first do not compress the root node 1,and mark its all its
1737 .check pointers as 1 and set its .base pointer as 1 as well. This
1738 allows to do a DFA construction from the compressed table later, and
1739 ensures that any .base pointers we calculate later are greater than
1740 0.
1741
1742 - We set 'pos' to indicate the first entry of the second node.
1743
1744 - We then iterate over the columns of the node, finding the first and
1745 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1746 and set the .check pointers accordingly, and advance pos
1747 appropriately and repreat for the next node. Note that when we copy
1748 the next pointers we have to convert them from the original
1749 NODEIDX form to NODENUM form as the former is not valid post
1750 compression.
1751
1752 - If a node has no transitions used we mark its base as 0 and do not
1753 advance the pos pointer.
1754
1755 - If a node only has one transition we use a second pointer into the
1756 structure to fill in allocated fail transitions from other states.
1757 This pointer is independent of the main pointer and scans forward
1758 looking for null transitions that are allocated to a state. When it
1759 finds one it writes the single transition into the "hole". If the
786e8c11 1760 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1761
1762 - Once compressed we can Renew/realloc the structures to release the
1763 excess space.
1764
1765 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1766 specifically Fig 3.47 and the associated pseudocode.
1767
1768 demq
1769 */
a3b680e6 1770 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1771 U32 state, charid;
a3621e74 1772 U32 pos = 0, zp=0;
1e2e3d02 1773 trie->statecount = laststate;
a3621e74
YO
1774
1775 for ( state = 1 ; state < laststate ; state++ ) {
1776 U8 flag = 0;
a28509cc
AL
1777 const U32 stateidx = TRIE_NODEIDX( state );
1778 const U32 o_used = trie->trans[ stateidx ].check;
1779 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1780 trie->trans[ stateidx ].check = 0;
1781
1782 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1783 if ( flag || trie->trans[ stateidx + charid ].next ) {
1784 if ( trie->trans[ stateidx + charid ].next ) {
1785 if (o_used == 1) {
1786 for ( ; zp < pos ; zp++ ) {
1787 if ( ! trie->trans[ zp ].next ) {
1788 break;
1789 }
1790 }
1791 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1792 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1793 trie->trans[ zp ].check = state;
1794 if ( ++zp > pos ) pos = zp;
1795 break;
1796 }
1797 used--;
1798 }
1799 if ( !flag ) {
1800 flag = 1;
1801 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1802 }
1803 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1804 trie->trans[ pos ].check = state;
1805 pos++;
1806 }
1807 }
1808 }
cc601c31 1809 trie->lasttrans = pos + 1;
c944940b
JH
1810 trie->states = (reg_trie_state *)
1811 PerlMemShared_realloc( trie->states, laststate
1812 * sizeof(reg_trie_state) );
a3621e74 1813 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1814 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1815 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1816 (int)depth * 2 + 2,"",
1817 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1818 (IV)next_alloc,
1819 (IV)pos,
a3621e74
YO
1820 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1821 );
1822
1823 } /* end table compress */
1824 }
1e2e3d02
YO
1825 DEBUG_TRIE_COMPILE_MORE_r(
1826 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1827 (int)depth * 2 + 2, "",
1828 (UV)trie->statecount,
1829 (UV)trie->lasttrans)
1830 );
cc601c31 1831 /* resize the trans array to remove unused space */
c944940b
JH
1832 trie->trans = (reg_trie_trans *)
1833 PerlMemShared_realloc( trie->trans, trie->lasttrans
1834 * sizeof(reg_trie_trans) );
a3621e74 1835
3dab1dad 1836 /* and now dump out the compressed format */
2b8b4781 1837 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1838
3dab1dad 1839 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1840 U8 nodetype =(U8)(flags & 0xFF);
1841 char *str=NULL;
786e8c11 1842
07be1b83 1843#ifdef DEBUGGING
e62cc96a 1844 regnode *optimize = NULL;
b57a0404
JH
1845 U32 mjd_offset = 0;
1846 U32 mjd_nodelen = 0;
07be1b83 1847#endif
a3621e74 1848 /*
3dab1dad
YO
1849 This means we convert either the first branch or the first Exact,
1850 depending on whether the thing following (in 'last') is a branch
1851 or not and whther first is the startbranch (ie is it a sub part of
1852 the alternation or is it the whole thing.)
1853 Assuming its a sub part we conver the EXACT otherwise we convert
1854 the whole branch sequence, including the first.
a3621e74 1855 */
3dab1dad 1856 /* Find the node we are going to overwrite */
7f69552c 1857 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1858 /* branch sub-chain */
3dab1dad 1859 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1860 DEBUG_r({
1861 mjd_offset= Node_Offset((convert));
1862 mjd_nodelen= Node_Length((convert));
1863 });
7f69552c
YO
1864 /* whole branch chain */
1865 } else {
1866 DEBUG_r({
1867 const regnode *nop = NEXTOPER( convert );
1868 mjd_offset= Node_Offset((nop));
1869 mjd_nodelen= Node_Length((nop));
1870 });
07be1b83 1871 }
7f69552c 1872
07be1b83
YO
1873 DEBUG_OPTIMISE_r(
1874 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1875 (int)depth * 2 + 2, "",
786e8c11 1876 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1877 );
a3621e74 1878
3dab1dad
YO
1879 /* But first we check to see if there is a common prefix we can
1880 split out as an EXACT and put in front of the TRIE node. */
1881 trie->startstate= 1;
55eed653 1882 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1883 U32 state;
1e2e3d02 1884 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1885 U32 ofs = 0;
8e11feef
RGS
1886 I32 idx = -1;
1887 U32 count = 0;
1888 const U32 base = trie->states[ state ].trans.base;
a3621e74 1889
3dab1dad 1890 if ( trie->states[state].wordnum )
8e11feef 1891 count = 1;
a3621e74 1892
8e11feef 1893 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1894 if ( ( base + ofs >= trie->uniquecharcount ) &&
1895 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1896 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1897 {
3dab1dad 1898 if ( ++count > 1 ) {
2b8b4781 1899 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1900 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1901 if ( state == 1 ) break;
3dab1dad
YO
1902 if ( count == 2 ) {
1903 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1904 DEBUG_OPTIMISE_r(
8e11feef
RGS
1905 PerlIO_printf(Perl_debug_log,
1906 "%*sNew Start State=%"UVuf" Class: [",
1907 (int)depth * 2 + 2, "",
786e8c11 1908 (UV)state));
be8e71aa 1909 if (idx >= 0) {
2b8b4781 1910 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1911 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1912
3dab1dad 1913 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1914 if ( folder )
1915 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1916 DEBUG_OPTIMISE_r(
07be1b83 1917 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1918 );
8e11feef
RGS
1919 }
1920 }
1921 TRIE_BITMAP_SET(trie,*ch);
1922 if ( folder )
1923 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1924 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1925 }
1926 idx = ofs;
1927 }
3dab1dad
YO
1928 }
1929 if ( count == 1 ) {
2b8b4781 1930 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1931 char *ch = SvPV_nolen( *tmp );
1932 DEBUG_OPTIMISE_r({
1933 SV *sv=sv_newmortal();
8e11feef
RGS
1934 PerlIO_printf( Perl_debug_log,
1935 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1936 (int)depth * 2 + 2, "",
de734bd5
A
1937 (UV)state, (UV)idx,
1938 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1939 PL_colors[0], PL_colors[1],
1940 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1941 PERL_PV_ESCAPE_FIRSTCHAR
1942 )
1943 );
1944 });
3dab1dad
YO
1945 if ( state==1 ) {
1946 OP( convert ) = nodetype;
1947 str=STRING(convert);
1948 STR_LEN(convert)=0;
1949 }
de734bd5
A
1950 while (*ch) {
1951 *str++ = *ch++;
1952 STR_LEN(convert)++;
1953 }
1954
8e11feef 1955 } else {
f9049ba1 1956#ifdef DEBUGGING
8e11feef
RGS
1957 if (state>1)
1958 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1959#endif
8e11feef
RGS
1960 break;
1961 }
1962 }
3dab1dad 1963 if (str) {
8e11feef 1964 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1965 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1966 trie->startstate = state;
07be1b83
YO
1967 trie->minlen -= (state - 1);
1968 trie->maxlen -= (state - 1);
1969 DEBUG_r({
1970 regnode *fix = convert;
de734bd5 1971 U32 word = trie->wordcount;
07be1b83
YO
1972 mjd_nodelen++;
1973 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1974 while( ++fix < n ) {
1975 Set_Node_Offset_Length(fix, 0, 0);
1976 }
de734bd5 1977 while (word--) {
2b8b4781 1978 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
1979 if (tmp) {
1980 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1981 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1982 else
1983 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1984 }
1985 }
07be1b83 1986 });
8e11feef
RGS
1987 if (trie->maxlen) {
1988 convert = n;
1989 } else {
3dab1dad 1990 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1991 DEBUG_r(optimize= n);
3dab1dad
YO
1992 }
1993 }
1994 }
a5ca303d
YO
1995 if (!jumper)
1996 jumper = last;
3dab1dad 1997 if ( trie->maxlen ) {
8e11feef
RGS
1998 NEXT_OFF( convert ) = (U16)(tail - convert);
1999 ARG_SET( convert, data_slot );
786e8c11
YO
2000 /* Store the offset to the first unabsorbed branch in
2001 jump[0], which is otherwise unused by the jump logic.
2002 We use this when dumping a trie and during optimisation. */
2003 if (trie->jump)
7f69552c 2004 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2005
786e8c11
YO
2006 /* XXXX */
2007 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2008 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2009 {
2010 OP( convert ) = TRIEC;
2011 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2012 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2013 trie->bitmap= NULL;
2014 } else
2015 OP( convert ) = TRIE;
a3621e74 2016
3dab1dad
YO
2017 /* store the type in the flags */
2018 convert->flags = nodetype;
a5ca303d
YO
2019 DEBUG_r({
2020 optimize = convert
2021 + NODE_STEP_REGNODE
2022 + regarglen[ OP( convert ) ];
2023 });
2024 /* XXX We really should free up the resource in trie now,
2025 as we won't use them - (which resources?) dmq */
3dab1dad 2026 }
a3621e74 2027 /* needed for dumping*/
e62cc96a 2028 DEBUG_r(if (optimize) {
07be1b83 2029 regnode *opt = convert;
e62cc96a 2030 while ( ++opt < optimize) {
07be1b83
YO
2031 Set_Node_Offset_Length(opt,0,0);
2032 }
786e8c11
YO
2033 /*
2034 Try to clean up some of the debris left after the
2035 optimisation.
a3621e74 2036 */
786e8c11 2037 while( optimize < jumper ) {
07be1b83 2038 mjd_nodelen += Node_Length((optimize));
a3621e74 2039 OP( optimize ) = OPTIMIZED;
07be1b83 2040 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2041 optimize++;
2042 }
07be1b83 2043 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2044 });
2045 } /* end node insert */
55eed653 2046 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2047#ifdef DEBUGGING
2048 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2049 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2050#else
2051 SvREFCNT_dec(revcharmap);
07be1b83 2052#endif
786e8c11
YO
2053 return trie->jump
2054 ? MADE_JUMP_TRIE
2055 : trie->startstate>1
2056 ? MADE_EXACT_TRIE
2057 : MADE_TRIE;
2058}
2059
2060STATIC void
2061S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2062{
2063/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2064
2065 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2066 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2067 ISBN 0-201-10088-6
2068
2069 We find the fail state for each state in the trie, this state is the longest proper
2070 suffix of the current states 'word' that is also a proper prefix of another word in our
2071 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2072 the DFA not to have to restart after its tried and failed a word at a given point, it
2073 simply continues as though it had been matching the other word in the first place.
2074 Consider
2075 'abcdgu'=~/abcdefg|cdgu/
2076 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2077 fail, which would bring use to the state representing 'd' in the second word where we would
2078 try 'g' and succeed, prodceding to match 'cdgu'.
2079 */
2080 /* add a fail transition */
3251b653
NC
2081 const U32 trie_offset = ARG(source);
2082 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2083 U32 *q;
2084 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2085 const U32 numstates = trie->statecount;
786e8c11
YO
2086 const U32 ubound = trie->lasttrans + ucharcount;
2087 U32 q_read = 0;
2088 U32 q_write = 0;
2089 U32 charid;
2090 U32 base = trie->states[ 1 ].trans.base;
2091 U32 *fail;
2092 reg_ac_data *aho;
2093 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2094 GET_RE_DEBUG_FLAGS_DECL;
2095#ifndef DEBUGGING
2096 PERL_UNUSED_ARG(depth);
2097#endif
2098
2099
2100 ARG_SET( stclass, data_slot );
c944940b 2101 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2102 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2103 aho->trie=trie_offset;
446bd890
NC
2104 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2105 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2106 Newxz( q, numstates, U32);
c944940b 2107 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2108 aho->refcount = 1;
2109 fail = aho->fail;
2110 /* initialize fail[0..1] to be 1 so that we always have
2111 a valid final fail state */
2112 fail[ 0 ] = fail[ 1 ] = 1;
2113
2114 for ( charid = 0; charid < ucharcount ; charid++ ) {
2115 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2116 if ( newstate ) {
2117 q[ q_write ] = newstate;
2118 /* set to point at the root */
2119 fail[ q[ q_write++ ] ]=1;
2120 }
2121 }
2122 while ( q_read < q_write) {
2123 const U32 cur = q[ q_read++ % numstates ];
2124 base = trie->states[ cur ].trans.base;
2125
2126 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2127 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2128 if (ch_state) {
2129 U32 fail_state = cur;
2130 U32 fail_base;
2131 do {
2132 fail_state = fail[ fail_state ];
2133 fail_base = aho->states[ fail_state ].trans.base;
2134 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2135
2136 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2137 fail[ ch_state ] = fail_state;
2138 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2139 {
2140 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2141 }
2142 q[ q_write++ % numstates] = ch_state;
2143 }
2144 }
2145 }
2146 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2147 when we fail in state 1, this allows us to use the
2148 charclass scan to find a valid start char. This is based on the principle
2149 that theres a good chance the string being searched contains lots of stuff
2150 that cant be a start char.
2151 */
2152 fail[ 0 ] = fail[ 1 ] = 0;
2153 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2154 PerlIO_printf(Perl_debug_log,
2155 "%*sStclass Failtable (%"UVuf" states): 0",
2156 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2157 );
786e8c11
YO
2158 for( q_read=1; q_read<numstates; q_read++ ) {
2159 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2160 }
2161 PerlIO_printf(Perl_debug_log, "\n");
2162 });
2163 Safefree(q);
2164 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2165}
2166
786e8c11 2167
a3621e74 2168/*
5d1c421c
JH
2169 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2170 * These need to be revisited when a newer toolchain becomes available.
2171 */
2172#if defined(__sparc64__) && defined(__GNUC__)
2173# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2174# undef SPARC64_GCC_WORKAROUND
2175# define SPARC64_GCC_WORKAROUND 1
2176# endif
2177#endif
2178
07be1b83 2179#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2180 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2181 SV * const mysv=sv_newmortal(); \
2182 regnode *Next = regnext(scan); \
2183 regprop(RExC_rx, mysv, scan); \
7f69552c 2184 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2185 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2186 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2187 }});
07be1b83 2188
1de06328
YO
2189
2190
2191
2192
07be1b83
YO
2193#define JOIN_EXACT(scan,min,flags) \
2194 if (PL_regkind[OP(scan)] == EXACT) \
2195 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2196
be8e71aa 2197STATIC U32
07be1b83
YO
2198S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2199 /* Merge several consecutive EXACTish nodes into one. */
2200 regnode *n = regnext(scan);
2201 U32 stringok = 1;
2202 regnode *next = scan + NODE_SZ_STR(scan);
2203 U32 merged = 0;
2204 U32 stopnow = 0;
2205#ifdef DEBUGGING
2206 regnode *stop = scan;
72f13be8 2207 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2208#else
d47053eb
RGS
2209 PERL_UNUSED_ARG(depth);
2210#endif
2211#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2212 PERL_UNUSED_ARG(flags);
2213 PERL_UNUSED_ARG(val);
07be1b83 2214#endif
07be1b83
YO
2215 DEBUG_PEEP("join",scan,depth);
2216
2217 /* Skip NOTHING, merge EXACT*. */
2218 while (n &&
2219 ( PL_regkind[OP(n)] == NOTHING ||
2220 (stringok && (OP(n) == OP(scan))))
2221 && NEXT_OFF(n)
2222 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2223
2224 if (OP(n) == TAIL || n > next)
2225 stringok = 0;
2226 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2227 DEBUG_PEEP("skip:",n,depth);
2228 NEXT_OFF(scan) += NEXT_OFF(n);
2229 next = n + NODE_STEP_REGNODE;
2230#ifdef DEBUGGING
2231 if (stringok)
2232 stop = n;
2233#endif
2234 n = regnext(n);
2235 }
2236 else if (stringok) {
786e8c11 2237 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2238 regnode * const nnext = regnext(n);
2239
2240 DEBUG_PEEP("merg",n,depth);
2241
2242 merged++;
2243 if (oldl + STR_LEN(n) > U8_MAX)
2244 break;
2245 NEXT_OFF(scan) += NEXT_OFF(n);
2246 STR_LEN(scan) += STR_LEN(n);
2247 next = n + NODE_SZ_STR(n);
2248 /* Now we can overwrite *n : */
2249 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2250#ifdef DEBUGGING
2251 stop = next - 1;
2252#endif
2253 n = nnext;
2254 if (stopnow) break;
2255 }
2256
d47053eb
RGS
2257#ifdef EXPERIMENTAL_INPLACESCAN
2258 if (flags && !NEXT_OFF(n)) {
2259 DEBUG_PEEP("atch", val, depth);
2260 if (reg_off_by_arg[OP(n)]) {
2261 ARG_SET(n, val - n);
2262 }
2263 else {
2264 NEXT_OFF(n) = val - n;
2265 }
2266 stopnow = 1;
2267 }
07be1b83
YO
2268#endif
2269 }
2270
2271 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2272 /*
2273 Two problematic code points in Unicode casefolding of EXACT nodes:
2274
2275 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2276 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2277
2278 which casefold to
2279
2280 Unicode UTF-8
2281
2282 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2283 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2284
2285 This means that in case-insensitive matching (or "loose matching",
2286 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2287 length of the above casefolded versions) can match a target string
2288 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2289 This would rather mess up the minimum length computation.
2290
2291 What we'll do is to look for the tail four bytes, and then peek
2292 at the preceding two bytes to see whether we need to decrease
2293 the minimum length by four (six minus two).
2294
2295 Thanks to the design of UTF-8, there cannot be false matches:
2296 A sequence of valid UTF-8 bytes cannot be a subsequence of
2297 another valid sequence of UTF-8 bytes.
2298
2299 */
2300 char * const s0 = STRING(scan), *s, *t;
2301 char * const s1 = s0 + STR_LEN(scan) - 1;
2302 char * const s2 = s1 - 4;
e294cc5d
JH
2303#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2304 const char t0[] = "\xaf\x49\xaf\x42";
2305#else
07be1b83 2306 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2307#endif
07be1b83
YO
2308 const char * const t1 = t0 + 3;
2309
2310 for (s = s0 + 2;
2311 s < s2 && (t = ninstr(s, s1, t0, t1));
2312 s = t + 4) {
e294cc5d
JH
2313#ifdef EBCDIC
2314 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2315 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2316#else
07be1b83
YO
2317 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2318 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2319#endif
07be1b83
YO
2320 *min -= 4;
2321 }
2322 }
2323
2324#ifdef DEBUGGING
2325 /* Allow dumping */
2326 n = scan + NODE_SZ_STR(scan);
2327 while (n <= stop) {
2328 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2329 OP(n) = OPTIMIZED;
2330 NEXT_OFF(n) = 0;
2331 }
2332 n++;
2333 }
2334#endif
2335 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2336 return stopnow;
2337}
2338
653099ff
GS
2339/* REx optimizer. Converts nodes into quickier variants "in place".
2340 Finds fixed substrings. */
2341
a0288114 2342/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2343 to the position after last scanned or to NULL. */
2344
40d049e4
YO
2345#define INIT_AND_WITHP \
2346 assert(!and_withp); \
2347 Newx(and_withp,1,struct regnode_charclass_class); \
2348 SAVEFREEPV(and_withp)
07be1b83 2349
b515a41d
YO
2350/* this is a chain of data about sub patterns we are processing that
2351 need to be handled seperately/specially in study_chunk. Its so
2352 we can simulate recursion without losing state. */
2353struct scan_frame;
2354typedef struct scan_frame {
2355 regnode *last; /* last node to process in this frame */
2356 regnode *next; /* next node to process when last is reached */
2357 struct scan_frame *prev; /*previous frame*/
2358 I32 stop; /* what stopparen do we use */
2359} scan_frame;
2360
304ee84b
YO
2361
2362#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2363
76e3520e 2364STATIC I32
40d049e4 2365S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2366 I32 *minlenp, I32 *deltap,
40d049e4
YO
2367 regnode *last,
2368 scan_data_t *data,
2369 I32 stopparen,
2370 U8* recursed,
2371 struct regnode_charclass_class *and_withp,
2372 U32 flags, U32 depth)
c277df42
IZ
2373 /* scanp: Start here (read-write). */
2374 /* deltap: Write maxlen-minlen here. */
2375 /* last: Stop before this one. */
40d049e4
YO
2376 /* data: string data about the pattern */
2377 /* stopparen: treat close N as END */
2378 /* recursed: which subroutines have we recursed into */
2379 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2380{
97aff369 2381 dVAR;
c277df42
IZ
2382 I32 min = 0, pars = 0, code;
2383 regnode *scan = *scanp, *next;
2384 I32 delta = 0;
2385 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2386 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2387 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2388 scan_data_t data_fake;
a3621e74 2389 SV *re_trie_maxbuff = NULL;
786e8c11 2390 regnode *first_non_open = scan;
e2e6a0f1 2391 I32 stopmin = I32_MAX;
8aa23a47
YO
2392 scan_frame *frame = NULL;
2393
a3621e74 2394 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2395
13a24bad 2396#ifdef DEBUGGING
40d049e4 2397 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2398#endif
40d049e4 2399
786e8c11 2400 if ( depth == 0 ) {
40d049e4 2401 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2402 first_non_open=regnext(first_non_open);
2403 }
2404
b81d288d 2405
8aa23a47
YO
2406 fake_study_recurse:
2407 while ( scan && OP(scan) != END && scan < last ){
2408 /* Peephole optimizer: */
304ee84b 2409 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2410 DEBUG_PEEP("Peep",scan,depth);
2411 JOIN_EXACT(scan,&min,0);
2412
2413 /* Follow the next-chain of the current node and optimize
2414 away all the NOTHINGs from it. */
2415 if (OP(scan) != CURLYX) {
2416 const int max = (reg_off_by_arg[OP(scan)]
2417 ? I32_MAX
2418 /* I32 may be smaller than U16 on CRAYs! */
2419 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2420 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2421 int noff;
2422 regnode *n = scan;
2423
2424 /* Skip NOTHING and LONGJMP. */
2425 while ((n = regnext(n))
2426 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2427 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2428 && off + noff < max)
2429 off += noff;
2430 if (reg_off_by_arg[OP(scan)])
2431 ARG(scan) = off;
2432 else
2433 NEXT_OFF(scan) = off;
2434 }
a3621e74 2435
c277df42 2436
8aa23a47
YO
2437
2438 /* The principal pseudo-switch. Cannot be a switch, since we
2439 look into several different things. */
2440 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2441 || OP(scan) == IFTHEN) {
2442 next = regnext(scan);
2443 code = OP(scan);
2444 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2445
2446 if (OP(next) == code || code == IFTHEN) {
2447 /* NOTE - There is similar code to this block below for handling
2448 TRIE nodes on a re-study. If you change stuff here check there
2449 too. */
2450 I32 max1 = 0, min1 = I32_MAX, num = 0;
2451 struct regnode_charclass_class accum;
2452 regnode * const startbranch=scan;
2453
2454 if (flags & SCF_DO_SUBSTR)
304ee84b 2455 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2456 if (flags & SCF_DO_STCLASS)
2457 cl_init_zero(pRExC_state, &accum);
2458
2459 while (OP(scan) == code) {
2460 I32 deltanext, minnext, f = 0, fake;
2461 struct regnode_charclass_class this_class;
2462
2463 num++;
2464 data_fake.flags = 0;
2465 if (data) {
2466 data_fake.whilem_c = data->whilem_c;
2467 data_fake.last_closep = data->last_closep;
2468 }
2469 else
2470 data_fake.last_closep = &fake;
58e23c8d
YO
2471
2472 data_fake.pos_delta = delta;
8aa23a47
YO
2473 next = regnext(scan);
2474 scan = NEXTOPER(scan);
2475 if (code != BRANCH)
c277df42 2476 scan = NEXTOPER(scan);
8aa23a47
YO
2477 if (flags & SCF_DO_STCLASS) {
2478 cl_init(pRExC_state, &this_class);
2479 data_fake.start_class = &this_class;
2480 f = SCF_DO_STCLASS_AND;
58e23c8d 2481 }
8aa23a47
YO
2482 if (flags & SCF_WHILEM_VISITED_POS)
2483 f |= SCF_WHILEM_VISITED_POS;
2484
2485 /* we suppose the run is continuous, last=next...*/
2486 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2487 next, &data_fake,
2488 stopparen, recursed, NULL, f,depth+1);
2489 if (min1 > minnext)
2490 min1 = minnext;
2491 if (max1 < minnext + deltanext)
2492 max1 = minnext + deltanext;
2493 if (deltanext == I32_MAX)
2494 is_inf = is_inf_internal = 1;
2495 scan = next;
2496 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2497 pars++;
2498 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2499 if ( stopmin > minnext)
2500 stopmin = min + min1;
2501 flags &= ~SCF_DO_SUBSTR;
2502 if (data)
2503 data->flags |= SCF_SEEN_ACCEPT;
2504 }
2505 if (data) {
2506 if (data_fake.flags & SF_HAS_EVAL)
2507 data->flags |= SF_HAS_EVAL;
2508 data->whilem_c = data_fake.whilem_c;
3dab1dad 2509 }
8aa23a47
YO
2510 if (flags & SCF_DO_STCLASS)
2511 cl_or(pRExC_state, &accum, &this_class);
2512 }
2513 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2514 min1 = 0;
2515 if (flags & SCF_DO_SUBSTR) {
2516 data->pos_min += min1;
2517 data->pos_delta += max1 - min1;
2518 if (max1 != min1 || is_inf)
2519 data->longest = &(data->longest_float);
2520 }
2521 min += min1;
2522 delta += max1 - min1;
2523 if (flags & SCF_DO_STCLASS_OR) {
2524 cl_or(pRExC_state, data->start_class, &accum);
2525 if (min1) {
2526 cl_and(data->start_class, and_withp);
2527 flags &= ~SCF_DO_STCLASS;
653099ff 2528 }
8aa23a47
YO
2529 }
2530 else if (flags & SCF_DO_STCLASS_AND) {
2531 if (min1) {
2532 cl_and(data->start_class, &accum);
2533 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2534 }
8aa23a47
YO
2535 else {
2536 /* Switch to OR mode: cache the old value of
2537 * data->start_class */
2538 INIT_AND_WITHP;
2539 StructCopy(data->start_class, and_withp,
2540 struct regnode_charclass_class);
2541 flags &= ~SCF_DO_STCLASS_AND;
2542 StructCopy(&accum, data->start_class,
2543 struct regnode_charclass_class);
2544 flags |= SCF_DO_STCLASS_OR;
2545 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2546 }
8aa23a47 2547 }
a3621e74 2548
8aa23a47
YO
2549 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2550 /* demq.
a3621e74 2551
8aa23a47
YO
2552 Assuming this was/is a branch we are dealing with: 'scan' now
2553 points at the item that follows the branch sequence, whatever
2554 it is. We now start at the beginning of the sequence and look
2555 for subsequences of
a3621e74 2556
8aa23a47
YO
2557 BRANCH->EXACT=>x1
2558 BRANCH->EXACT=>x2
2559 tail
a3621e74 2560
8aa23a47 2561 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2562
8aa23a47
YO
2563 If we can find such a subseqence we need to turn the first
2564 element into a trie and then add the subsequent branch exact
2565 strings to the trie.
a3621e74 2566
8aa23a47 2567 We have two cases
a3621e74 2568
8aa23a47 2569 1. patterns where the whole set of branch can be converted.
a3621e74 2570
8aa23a47 2571 2. patterns where only a subset can be converted.
a3621e74 2572
8aa23a47
YO
2573 In case 1 we can replace the whole set with a single regop
2574 for the trie. In case 2 we need to keep the start and end
2575 branchs so
a3621e74 2576
8aa23a47
YO
2577 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2578 becomes BRANCH TRIE; BRANCH X;
786e8c11 2579
8aa23a47
YO
2580 There is an additional case, that being where there is a
2581 common prefix, which gets split out into an EXACT like node
2582 preceding the TRIE node.
a3621e74 2583
8aa23a47
YO
2584 If x(1..n)==tail then we can do a simple trie, if not we make
2585 a "jump" trie, such that when we match the appropriate word
2586 we "jump" to the appopriate tail node. Essentailly we turn
2587 a nested if into a case structure of sorts.
b515a41d 2588
8aa23a47
YO
2589 */
2590
2591 int made=0;
2592 if (!re_trie_maxbuff) {
2593 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2594 if (!SvIOK(re_trie_maxbuff))
2595 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2596 }
2597 if ( SvIV(re_trie_maxbuff)>=0 ) {
2598 regnode *cur;
2599 regnode *first = (regnode *)NULL;
2600 regnode *last = (regnode *)NULL;
2601 regnode *tail = scan;
2602 U8 optype = 0;
2603 U32 count=0;
a3621e74
YO
2604
2605#ifdef DEBUGGING
8aa23a47 2606 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2607#endif
8aa23a47
YO
2608 /* var tail is used because there may be a TAIL
2609 regop in the way. Ie, the exacts will point to the
2610 thing following the TAIL, but the last branch will
2611 point at the TAIL. So we advance tail. If we
2612 have nested (?:) we may have to move through several
2613 tails.
2614 */
2615
2616 while ( OP( tail ) == TAIL ) {
2617 /* this is the TAIL generated by (?:) */
2618 tail = regnext( tail );
2619 }
a3621e74 2620
8aa23a47
YO
2621
2622 DEBUG_OPTIMISE_r({
2623 regprop(RExC_rx, mysv, tail );
2624 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2625 (int)depth * 2 + 2, "",
2626 "Looking for TRIE'able sequences. Tail node is: ",
2627 SvPV_nolen_const( mysv )
2628 );
2629 });
2630
2631 /*
2632
2633 step through the branches, cur represents each
2634 branch, noper is the first thing to be matched
2635 as part of that branch and noper_next is the
2636 regnext() of that node. if noper is an EXACT
2637 and noper_next is the same as scan (our current
2638 position in the regex) then the EXACT branch is
2639 a possible optimization target. Once we have
2640 two or more consequetive such branches we can
2641 create a trie of the EXACT's contents and stich
2642 it in place. If the sequence represents all of
2643 the branches we eliminate the whole thing and
2644 replace it with a single TRIE. If it is a
2645 subsequence then we need to stitch it in. This
2646 means the first branch has to remain, and needs
2647 to be repointed at the item on the branch chain
2648 following the last branch optimized. This could
2649 be either a BRANCH, in which case the
2650 subsequence is internal, or it could be the
2651 item following the branch sequence in which
2652 case the subsequence is at the end.
2653
2654 */
2655
2656 /* dont use tail as the end marker for this traverse */
2657 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2658 regnode * const noper = NEXTOPER( cur );
b515a41d 2659#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2660 regnode * const noper_next = regnext( noper );
b515a41d
YO
2661#endif
2662
8aa23a47
YO
2663 DEBUG_OPTIMISE_r({
2664 regprop(RExC_rx, mysv, cur);
2665 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2666 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2667
2668 regprop(RExC_rx, mysv, noper);
2669 PerlIO_printf( Perl_debug_log, " -> %s",
2670 SvPV_nolen_const(mysv));
2671
2672 if ( noper_next ) {
2673 regprop(RExC_rx, mysv, noper_next );
2674 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2675 SvPV_nolen_const(mysv));
2676 }
2677 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2678 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2679 });
2680 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2681 : PL_regkind[ OP( noper ) ] == EXACT )
2682 || OP(noper) == NOTHING )
786e8c11 2683#ifdef NOJUMPTRIE
8aa23a47 2684 && noper_next == tail
786e8c11 2685#endif
8aa23a47
YO
2686 && count < U16_MAX)
2687 {
2688 count++;
2689 if ( !first || optype == NOTHING ) {
2690 if (!first) first = cur;
2691 optype = OP( noper );
2692 } else {
2693 last = cur;
2694 }
2695 } else {
2696 if ( last ) {
2697 make_trie( pRExC_state,
2698 startbranch, first, cur, tail, count,
2699 optype, depth+1 );
2700 }
2701 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2702#ifdef NOJUMPTRIE
8aa23a47 2703 && noper_next == tail
786e8c11 2704#endif
8aa23a47
YO
2705 ){
2706 count = 1;
2707 first = cur;
2708 optype = OP( noper );
2709 } else {
2710 count = 0;
2711 first = NULL;
2712 optype = 0;
2713 }
2714 last = NULL;
2715 }
2716 }
2717 DEBUG_OPTIMISE_r({
2718 regprop(RExC_rx, mysv, cur);
2719 PerlIO_printf( Perl_debug_log,
2720 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2721 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2722
2723 });
2724 if ( last ) {
2725 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2726#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2727 if ( ((made == MADE_EXACT_TRIE &&
2728 startbranch == first)
2729 || ( first_non_open == first )) &&
2730 depth==0 ) {
2731 flags |= SCF_TRIE_RESTUDY;
2732 if ( startbranch == first
2733 && scan == tail )
2734 {
2735 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2736 }
2737 }
3dab1dad 2738#endif
8aa23a47
YO
2739 }
2740 }
2741
2742 } /* do trie */
2743
653099ff 2744 }
8aa23a47
YO
2745 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2746 scan = NEXTOPER(NEXTOPER(scan));
2747 } else /* single branch is optimized. */
2748 scan = NEXTOPER(scan);
2749 continue;
2750 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2751 scan_frame *newframe = NULL;
2752 I32 paren;
2753 regnode *start;
2754 regnode *end;
2755
2756 if (OP(scan) != SUSPEND) {
2757 /* set the pointer */
2758 if (OP(scan) == GOSUB) {
2759 paren = ARG(scan);
2760 RExC_recurse[ARG2L(scan)] = scan;
2761 start = RExC_open_parens[paren-1];
2762 end = RExC_close_parens[paren-1];
2763 } else {
2764 paren = 0;
f8fc2ecf 2765 start = RExC_rxi->program + 1;
8aa23a47
YO
2766 end = RExC_opend;
2767 }
2768 if (!recursed) {
2769 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2770 SAVEFREEPV(recursed);
2771 }
2772 if (!PAREN_TEST(recursed,paren+1)) {
2773 PAREN_SET(recursed,paren+1);
2774 Newx(newframe,1,scan_frame);
2775 } else {
2776 if (flags & SCF_DO_SUBSTR) {
304ee84b 2777 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2778 data->longest = &(data->longest_float);
2779 }
2780 is_inf = is_inf_internal = 1;
2781 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2782 cl_anything(pRExC_state, data->start_class);
2783 flags &= ~SCF_DO_STCLASS;
2784 }
2785 } else {
2786 Newx(newframe,1,scan_frame);
2787 paren = stopparen;
2788 start = scan+2;
2789 end = regnext(scan);
2790 }
2791 if (newframe) {
2792 assert(start);
2793 assert(end);
2794 SAVEFREEPV(newframe);
2795 newframe->next = regnext(scan);
2796 newframe->last = last;
2797 newframe->stop = stopparen;
2798 newframe->prev = frame;
2799
2800 frame = newframe;
2801 scan = start;
2802 stopparen = paren;
2803 last = end;
2804
2805 continue;
2806 }
2807 }
2808 else if (OP(scan) == EXACT) {
2809 I32 l = STR_LEN(scan);
2810 UV uc;
2811 if (UTF) {
2812 const U8 * const s = (U8*)STRING(scan);
2813 l = utf8_length(s, s + l);
2814 uc = utf8_to_uvchr(s, NULL);
2815 } else {
2816 uc = *((U8*)STRING(scan));
2817 }
2818 min += l;
2819 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2820 /* The code below prefers earlier match for fixed
2821 offset, later match for variable offset. */
2822 if (data->last_end == -1) { /* Update the start info. */
2823 data->last_start_min = data->pos_min;
2824 data->last_start_max = is_inf
2825 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2826 }
8aa23a47
YO
2827 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2828 if (UTF)
2829 SvUTF8_on(data->last_found);
2830 {
2831 SV * const sv = data->last_found;
2832 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2833 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2834 if (mg && mg->mg_len >= 0)
2835 mg->mg_len += utf8_length((U8*)STRING(scan),
2836 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2837 }
8aa23a47
YO
2838 data->last_end = data->pos_min + l;
2839 data->pos_min += l; /* As in the first entry. */
2840 data->flags &= ~SF_BEFORE_EOL;
2841 }
2842 if (flags & SCF_DO_STCLASS_AND) {
2843 /* Check whether it is compatible with what we know already! */
2844 int compat = 1;
2845
2846 if (uc >= 0x100 ||
2847 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2848 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2849 && (!(data->start_class->flags & ANYOF_FOLD)
2850 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2851 )
2852 compat = 0;
2853 ANYOF_CLASS_ZERO(data->start_class);
2854 ANYOF_BITMAP_ZERO(data->start_class);
2855 if (compat)
2856 ANYOF_BITMAP_SET(data->start_class, uc);
2857 data->start_class->flags &= ~ANYOF_EOS;
2858 if (uc < 0x100)
2859 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2860 }
2861 else if (flags & SCF_DO_STCLASS_OR) {
2862 /* false positive possible if the class is case-folded */
2863 if (uc < 0x100)
2864 ANYOF_BITMAP_SET(data->start_class, uc);
2865 else
2866 data->start_class->flags |= ANYOF_UNICODE_ALL;
2867 data->start_class->flags &= ~ANYOF_EOS;
2868 cl_and(data->start_class, and_withp);
2869 }
2870 flags &= ~SCF_DO_STCLASS;
2871 }
2872 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2873 I32 l = STR_LEN(scan);
2874 UV uc = *((U8*)STRING(scan));
2875
2876 /* Search for fixed substrings supports EXACT only. */
2877 if (flags & SCF_DO_SUBSTR) {
2878 assert(data);
304ee84b 2879 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2880 }
2881 if (UTF) {
2882 const U8 * const s = (U8 *)STRING(scan);
2883 l = utf8_length(s, s + l);
2884 uc = utf8_to_uvchr(s, NULL);
2885 }
2886 min += l;
2887 if (flags & SCF_DO_SUBSTR)
2888 data->pos_min += l;
2889 if (flags & SCF_DO_STCLASS_AND) {
2890 /* Check whether it is compatible with what we know already! */
2891 int compat = 1;
2892
2893 if (uc >= 0x100 ||
2894 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2895 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2896 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2897 compat = 0;
2898 ANYOF_CLASS_ZERO(data->start_class);
2899 ANYOF_BITMAP_ZERO(data->start_class);
2900 if (compat) {
2901 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2902 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2903 data->start_class->flags |= ANYOF_FOLD;
2904 if (OP(scan) == EXACTFL)
2905 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2906 }
8aa23a47
YO
2907 }
2908 else if (flags & SCF_DO_STCLASS_OR) {
2909 if (data->start_class->flags & ANYOF_FOLD) {
2910 /* false positive possible if the class is case-folded.
2911 Assume that the locale settings are the same... */
1aa99e6b
IH
2912 if (uc < 0x100)
2913 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2914 data->start_class->flags &= ~ANYOF_EOS;
2915 }
8aa23a47 2916 cl_and(data->start_class, and_withp);
653099ff 2917 }
8aa23a47
YO
2918 flags &= ~SCF_DO_STCLASS;
2919 }
2920 else if (strchr((const char*)PL_varies,OP(scan))) {
2921 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2922 I32 f = flags, pos_before = 0;
2923 regnode * const oscan = scan;
2924 struct regnode_charclass_class this_class;
2925 struct regnode_charclass_class *oclass = NULL;
2926 I32 next_is_eval = 0;
2927
2928 switch (PL_regkind[OP(scan)]) {
2929 case WHILEM: /* End of (?:...)* . */
2930 scan = NEXTOPER(scan);
2931 goto finish;
2932 case PLUS:
2933 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2934 next = NEXTOPER(scan);
2935 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2936 mincount = 1;
2937 maxcount = REG_INFTY;
2938 next = regnext(scan);
2939 scan = NEXTOPER(scan);
2940 goto do_curly;
2941 }
2942 }
2943 if (flags & SCF_DO_SUBSTR)
2944 data->pos_min++;
2945 min++;
2946 /* Fall through. */
2947 case STAR:
2948 if (flags & SCF_DO_STCLASS) {
2949 mincount = 0;
2950 maxcount = REG_INFTY;
2951 next = regnext(scan);
2952 scan = NEXTOPER(scan);
2953 goto do_curly;
2954 }
2955 is_inf = is_inf_internal = 1;
2956 scan = regnext(scan);
c277df42 2957 if (flags & SCF_DO_SUBSTR) {
304ee84b 2958 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2959 data->longest = &(data->longest_float);
c277df42 2960 }
8aa23a47
YO
2961 goto optimize_curly_tail;
2962 case CURLY:
2963 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2964 && (scan->flags == stopparen))
2965 {
2966 mincount = 1;
2967 maxcount = 1;
2968 } else {
2969 mincount = ARG1(scan);
2970 maxcount = ARG2(scan);
653099ff 2971 }
8aa23a47
YO
2972 next = regnext(scan);
2973 if (OP(scan) == CURLYX) {
2974 I32 lp = (data ? *(data->last_closep) : 0);
2975 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2976 }
8aa23a47
YO
2977 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2978 next_is_eval = (OP(scan) == EVAL);
2979 do_curly:
2980 if (flags & SCF_DO_SUBSTR) {
304ee84b 2981 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 2982 pos_before = data->pos_min;
b45f050a 2983 }
8aa23a47
YO
2984 if (data) {
2985 fl = data->flags;
2986 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2987 if (is_inf)
2988 data->flags |= SF_IS_INF;
2989 }
2990 if (flags & SCF_DO_STCLASS) {
2991 cl_init(pRExC_state, &this_class);
2992 oclass = data->start_class;
2993 data->start_class = &this_class;
2994 f |= SCF_DO_STCLASS_AND;
2995 f &= ~SCF_DO_STCLASS_OR;
2996 }
2997 /* These are the cases when once a subexpression
2998 fails at a particular position, it cannot succeed
2999 even after backtracking at the enclosing scope.
3000
3001 XXXX what if minimal match and we are at the
3002 initial run of {n,m}? */
3003 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3004 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3005
8aa23a47
YO
3006 /* This will finish on WHILEM, setting scan, or on NULL: */
3007 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3008 last, data, stopparen, recursed, NULL,
3009 (mincount == 0
3010 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3011
8aa23a47
YO
3012 if (flags & SCF_DO_STCLASS)
3013 data->start_class = oclass;
3014 if (mincount == 0 || minnext == 0) {
3015 if (flags & SCF_DO_STCLASS_OR) {
3016 cl_or(pRExC_state, data->start_class, &this_class);
3017 }
3018 else if (flags & SCF_DO_STCLASS_AND) {
3019 /* Switch to OR mode: cache the old value of
3020 * data->start_class */
3021 INIT_AND_WITHP;
3022 StructCopy(data->start_class, and_withp,
3023 struct regnode_charclass_class);
3024 flags &= ~SCF_DO_STCLASS_AND;
3025 StructCopy(&this_class, data->start_class,
3026 struct regnode_charclass_class);
3027 flags |= SCF_DO_STCLASS_OR;
3028 data->start_class->flags |= ANYOF_EOS;
3029 }
3030 } else { /* Non-zero len */
3031 if (flags & SCF_DO_STCLASS_OR) {
3032 cl_or(pRExC_state, data->start_class, &this_class);
3033 cl_and(data->start_class, and_withp);
3034 }
3035 else if (flags & SCF_DO_STCLASS_AND)
3036 cl_and(data->start_class, &this_class);
3037 flags &= ~SCF_DO_STCLASS;
3038 }
3039 if (!scan) /* It was not CURLYX, but CURLY. */
3040 scan = next;
3041 if ( /* ? quantifier ok, except for (?{ ... }) */
3042 (next_is_eval || !(mincount == 0 && maxcount == 1))
3043 && (minnext == 0) && (deltanext == 0)
3044 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3045 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3046 && ckWARN(WARN_REGEXP))
3047 {
3048 vWARN(RExC_parse,
3049 "Quantifier unexpected on zero-length expression");
3050 }
3051
3052 min += minnext * mincount;
3053 is_inf_internal |= ((maxcount == REG_INFTY
3054 && (minnext + deltanext) > 0)
3055 || deltanext == I32_MAX);
3056 is_inf |= is_inf_internal;
3057 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3058
3059 /* Try powerful optimization CURLYX => CURLYN. */
3060 if ( OP(oscan) == CURLYX && data
3061 && data->flags & SF_IN_PAR
3062 && !(data->flags & SF_HAS_EVAL)
3063 && !deltanext && minnext == 1 ) {
3064 /* Try to optimize to CURLYN. */
3065 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3066 regnode * const nxt1 = nxt;
497b47a8 3067#ifdef DEBUGGING
8aa23a47 3068 regnode *nxt2;
497b47a8 3069#endif
c277df42 3070
8aa23a47
YO
3071 /* Skip open. */
3072 nxt = regnext(nxt);
3073 if (!strchr((const char*)PL_simple,OP(nxt))
3074 && !(PL_regkind[OP(nxt)] == EXACT
3075 && STR_LEN(nxt) == 1))
3076 goto nogo;
497b47a8 3077#ifdef DEBUGGING
8aa23a47 3078 nxt2 = nxt;
497b47a8 3079#endif
8aa23a47
YO
3080 nxt = regnext(nxt);
3081 if (OP(nxt) != CLOSE)
3082 goto nogo;
3083 if (RExC_open_parens) {
3084 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3085 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3086 }
3087 /* Now we know that nxt2 is the only contents: */
3088 oscan->flags = (U8)ARG(nxt);
3089 OP(oscan) = CURLYN;
3090 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3091
c277df42 3092#ifdef DEBUGGING
8aa23a47
YO
3093 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3094 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3095 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3096 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3097 OP(nxt + 1) = OPTIMIZED; /* was count. */
3098 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3099#endif
8aa23a47
YO
3100 }
3101 nogo:
3102
3103 /* Try optimization CURLYX => CURLYM. */
3104 if ( OP(oscan) == CURLYX && data
3105 && !(data->flags & SF_HAS_PAR)
3106 && !(data->flags & SF_HAS_EVAL)
3107 && !deltanext /* atom is fixed width */
3108 && minnext != 0 /* CURLYM can't handle zero width */
3109 ) {
3110 /* XXXX How to optimize if data == 0? */
3111 /* Optimize to a simpler form. */
3112 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3113 regnode *nxt2;
3114
3115 OP(oscan) = CURLYM;
3116 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3117 && (OP(nxt2) != WHILEM))
3118 nxt = nxt2;
3119 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3120 /* Need to optimize away parenths. */
3121 if (data->flags & SF_IN_PAR) {
3122 /* Set the parenth number. */
3123 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3124
3125 if (OP(nxt) != CLOSE)
3126 FAIL("Panic opt close");
3127 oscan->flags = (U8)ARG(nxt);
3128 if (RExC_open_parens) {
3129 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3130 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3131 }
8aa23a47
YO
3132 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3133 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3134
c277df42 3135#ifdef DEBUGGING
8aa23a47
YO
3136 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3137 OP(nxt + 1) = OPTIMIZED; /* was count. */
3138 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3139 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3140#endif
c277df42 3141#if 0
8aa23a47
YO
3142 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3143 regnode *nnxt = regnext(nxt1);
3144
3145 if (nnxt == nxt) {
3146 if (reg_off_by_arg[OP(nxt1)])
3147 ARG_SET(nxt1, nxt2 - nxt1);
3148 else if (nxt2 - nxt1 < U16_MAX)
3149 NEXT_OFF(nxt1) = nxt2 - nxt1;
3150 else
3151 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3152 }
8aa23a47 3153 nxt1 = nnxt;
c277df42 3154 }
5d1c421c 3155#endif
8aa23a47
YO
3156 /* Optimize again: */
3157 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3158 NULL, stopparen, recursed, NULL, 0,depth+1);
3159 }
3160 else
3161 oscan->flags = 0;
3162 }
3163 else if ((OP(oscan) == CURLYX)
3164 && (flags & SCF_WHILEM_VISITED_POS)
3165 /* See the comment on a similar expression above.
3166 However, this time it not a subexpression
3167 we care about, but the expression itself. */
3168 && (maxcount == REG_INFTY)
3169 && data && ++data->whilem_c < 16) {
3170 /* This stays as CURLYX, we can put the count/of pair. */
3171 /* Find WHILEM (as in regexec.c) */
3172 regnode *nxt = oscan + NEXT_OFF(oscan);
3173
3174 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3175 nxt += ARG(nxt);
3176 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3177 | (RExC_whilem_seen << 4)); /* On WHILEM */
3178 }
3179 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3180 pars++;
3181 if (flags & SCF_DO_SUBSTR) {
3182 SV *last_str = NULL;
3183 int counted = mincount != 0;
a0ed51b3 3184
8aa23a47
YO
3185 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3186#if defined(SPARC64_GCC_WORKAROUND)
3187 I32 b = 0;
3188 STRLEN l = 0;
3189 const char *s = NULL;
3190 I32 old = 0;
b515a41d 3191
8aa23a47
YO
3192 if (pos_before >= data->last_start_min)
3193 b = pos_before;
3194 else
3195 b = data->last_start_min;
b515a41d 3196
8aa23a47
YO
3197 l = 0;
3198 s = SvPV_const(data->last_found, l);
3199 old = b - data->last_start_min;
3200
3201#else
3202 I32 b = pos_before >= data->last_start_min
3203 ? pos_before : data->last_start_min;
3204 STRLEN l;
3205 const char * const s = SvPV_const(data->last_found, l);
3206 I32 old = b - data->last_start_min;
3207#endif
3208
3209 if (UTF)
3210 old = utf8_hop((U8*)s, old) - (U8*)s;
3211
3212 l -= old;
3213 /* Get the added string: */
3214 last_str = newSVpvn(s + old, l);
3215 if (UTF)
3216 SvUTF8_on(last_str);
3217 if (deltanext == 0 && pos_before == b) {
3218 /* What was added is a constant string */
3219 if (mincount > 1) {
3220 SvGROW(last_str, (mincount * l) + 1);
3221 repeatcpy(SvPVX(last_str) + l,
3222 SvPVX_const(last_str), l, mincount - 1);
3223 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3224 /* Add additional parts. */
3225 SvCUR_set(data->last_found,
3226 SvCUR(data->last_found) - l);
3227 sv_catsv(data->last_found, last_str);
3228 {
3229 SV * sv = data->last_found;
3230 MAGIC *mg =
3231 SvUTF8(sv) && SvMAGICAL(sv) ?
3232 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3233 if (mg && mg->mg_len >= 0)
3234 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3235 }
8aa23a47 3236 data->last_end += l * (mincount - 1);
b515a41d 3237 }
8aa23a47
YO
3238 } else {
3239 /* start offset must point into the last copy */
3240 data->last_start_min += minnext * (mincount - 1);
3241 data->last_start_max += is_inf ? I32_MAX
3242 : (maxcount - 1) * (minnext + data->pos_delta);
3243 }
c277df42 3244 }
8aa23a47
YO
3245 /* It is counted once already... */
3246 data->pos_min += minnext * (mincount - counted);
3247 data->pos_delta += - counted * deltanext +
3248 (minnext + deltanext) * maxcount - minnext * mincount;
3249 if (mincount != maxcount) {
3250 /* Cannot extend fixed substrings found inside
3251 the group. */
304ee84b 3252 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3253 if (mincount && last_str) {
3254 SV * const sv = data->last_found;
3255 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3256 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3257
3258 if (mg)
3259 mg->mg_len = -1;
3260 sv_setsv(sv, last_str);
3261 data->last_end = data->pos_min;
3262 data->last_start_min =
3263 data->pos_min - CHR_SVLEN(last_str);
3264 data->last_start_max = is_inf
3265 ? I32_MAX
3266 : data->pos_min + data->pos_delta
3267 - CHR_SVLEN(last_str);
3268 }
3269 data->longest = &(data->longest_float);
3270 }
3271 SvREFCNT_dec(last_str);
c277df42 3272 }
8aa23a47
YO
3273 if (data && (fl & SF_HAS_EVAL))
3274 data->flags |= SF_HAS_EVAL;
3275 optimize_curly_tail:
3276 if (OP(oscan) != CURLYX) {
3277 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3278 && NEXT_OFF(next))
3279 NEXT_OFF(oscan) += NEXT_OFF(next);
3280 }
3281 continue;
3282 default: /* REF and CLUMP only? */
3283 if (flags & SCF_DO_SUBSTR) {
304ee84b 3284 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3285 data->longest = &(data->longest_float);
3286 }
3287 is_inf = is_inf_internal = 1;
3288 if (flags & SCF_DO_STCLASS_OR)
3289 cl_anything(pRExC_state, data->start_class);
3290 flags &= ~SCF_DO_STCLASS;
3291 break;
c277df42 3292 }
8aa23a47
YO
3293 }
3294 else if (strchr((const char*)PL_simple,OP(scan))) {
3295 int value = 0;
653099ff 3296
8aa23a47 3297 if (flags & SCF_DO_SUBSTR) {
304ee84b 3298 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3299 data->pos_min++;
3300 }
3301 min++;
3302 if (flags & SCF_DO_STCLASS) {
3303 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3304
8aa23a47
YO
3305 /* Some of the logic below assumes that switching
3306 locale on will only add false positives. */
3307 switch (PL_regkind[OP(scan)]) {
3308 case SANY:
3309 default:
3310 do_default:
3311 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3312 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3313 cl_anything(pRExC_state, data->start_class);
3314 break;
3315 case REG_ANY:
3316 if (OP(scan) == SANY)
3317 goto do_default;
3318 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3319 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3320 || (data->start_class->flags & ANYOF_CLASS));
3321 cl_anything(pRExC_state, data->start_class);
653099ff 3322 }
8aa23a47
YO
3323 if (flags & SCF_DO_STCLASS_AND || !value)
3324 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3325 break;
3326 case ANYOF:
3327 if (flags & SCF_DO_STCLASS_AND)
3328 cl_and(data->start_class,
3329 (struct regnode_charclass_class*)scan);
653099ff 3330 else
8aa23a47
YO
3331 cl_or(pRExC_state, data->start_class,
3332 (struct regnode_charclass_class*)scan);
3333 break;
3334 case ALNUM:
3335 if (flags & SCF_DO_STCLASS_AND) {
3336 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3337 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3338 for (value = 0; value < 256; value++)
3339 if (!isALNUM(value))
3340 ANYOF_BITMAP_CLEAR(data->start_class, value);
3341 }
653099ff 3342 }
8aa23a47
YO
3343 else {
3344 if (data->start_class->flags & ANYOF_LOCALE)
3345 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3346 else {
3347 for (value = 0; value < 256; value++)
3348 if (isALNUM(value))
3349 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3350 }
8aa23a47
YO
3351 }
3352 break;
3353 case ALNUML:
3354 if (flags & SCF_DO_STCLASS_AND) {
3355 if (data->start_class->flags & ANYOF_LOCALE)
3356 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3357 }
3358 else {
3359 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3360 data->start_class->flags |= ANYOF_LOCALE;
3361 }
3362 break;
3363 case NALNUM:
3364 if (flags & SCF_DO_STCLASS_AND) {
3365 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3366 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3367 for (value = 0; value < 256; value++)
3368 if (isALNUM(value))
3369 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3370 }
3371 }
8aa23a47
YO
3372 else {
3373 if (data->start_class->flags & ANYOF_LOCALE)
3374 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3375 else {
3376 for (value = 0; value < 256; value++)
3377 if (!isALNUM(value))
3378 ANYOF_BITMAP_SET(data->start_class, value);
3379 }
653099ff 3380 }
8aa23a47
YO
3381 break;
3382 case NALNUML:
3383 if (flags & SCF_DO_STCLASS_AND) {
3384 if (data->start_class->flags & ANYOF_LOCALE)
3385 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3386 }
8aa23a47
YO
3387 else {
3388 data->start_class->flags |= ANYOF_LOCALE;
3389 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3390 }
3391 break;
3392 case SPACE:
3393 if (flags & SCF_DO_STCLASS_AND) {
3394 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3395 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3396 for (value = 0; value < 256; value++)
3397 if (!isSPACE(value))
3398 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3399 }
3400 }
8aa23a47
YO
3401 else {
3402 if (data->start_class->flags & ANYOF_LOCALE)
3403 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3404 else {
3405 for (value = 0; value < 256; value++)
3406 if (isSPACE(value))
3407 ANYOF_BITMAP_SET(data->start_class, value);
3408 }
653099ff 3409 }
8aa23a47
YO
3410 break;
3411 case SPACEL:
3412 if (flags & SCF_DO_STCLASS_AND) {
3413 if (data->start_class->flags & ANYOF_LOCALE)
3414 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3415 }
3416 else {
3417 data->start_class->flags |= ANYOF_LOCALE;
3418 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3419 }
3420 break;
3421 case NSPACE:
3422 if (flags & SCF_DO_STCLASS_AND) {
3423 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3424 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3425 for (value = 0; value < 256; value++)
3426 if (isSPACE(value))
3427 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3428 }
8aa23a47
YO
3429 }
3430 else {
3431 if (data->start_class->flags & ANYOF_LOCALE)
3432 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3433 else {
3434 for (value = 0; value < 256; value++)
3435 if (!isSPACE(value))
3436 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3437 }
3438 }
8aa23a47
YO
3439 break;
3440 case NSPACEL:
3441 if (flags & SCF_DO_STCLASS_AND) {
3442 if (data->start_class->flags & ANYOF_LOCALE) {
3443 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3444 for (value = 0; value < 256; value++)
3445 if (!isSPACE(value))
3446 ANYOF_BITMAP_CLEAR(data->start_class, value);
3447 }
653099ff 3448 }
8aa23a47
YO
3449 else {
3450 data->start_class->flags |= ANYOF_LOCALE;
3451 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3452 }
3453 break;
3454 case DIGIT:
3455 if (flags & SCF_DO_STCLASS_AND) {
3456 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3457 for (value = 0; value < 256; value++)
3458 if (!isDIGIT(value))
3459 ANYOF_BITMAP_CLEAR(data->start_class, value);
3460 }
3461 else {
3462 if (data->start_class->flags & ANYOF_LOCALE)
3463 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3464 else {
3465 for (value = 0; value < 256; value++)
3466 if (isDIGIT(value))
3467 ANYOF_BITMAP_SET(data->start_class, value);
3468 }
3469 }
3470 break;
3471 case NDIGIT:
3472 if (flags & SCF_DO_STCLASS_AND) {
3473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3474 for (value = 0; value < 256; value++)
3475 if (isDIGIT(value))
3476 ANYOF_BITMAP_CLEAR(data->start_class, value);
3477 }
3478 else {
3479 if (data->start_class->flags & ANYOF_LOCALE)
3480 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3481 else {
3482 for (value = 0; value < 256; value++)
3483 if (!isDIGIT(value))
3484 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3485 }
3486 }
8aa23a47
YO
3487 break;
3488 }
3489 if (flags & SCF_DO_STCLASS_OR)
3490 cl_and(data->start_class, and_withp);
3491 flags &= ~SCF_DO_STCLASS;
3492 }
3493 }
3494 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3495 data->flags |= (OP(scan) == MEOL
3496 ? SF_BEFORE_MEOL
3497 : SF_BEFORE_SEOL);
3498 }
3499 else if ( PL_regkind[OP(scan)] == BRANCHJ
3500 /* Lookbehind, or need to calculate parens/evals/stclass: */
3501 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3502 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3503 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3504 || OP(scan) == UNLESSM )
3505 {
3506 /* Negative Lookahead/lookbehind
3507 In this case we can't do fixed string optimisation.
3508 */
1de06328 3509
8aa23a47
YO
3510 I32 deltanext, minnext, fake = 0;
3511 regnode *nscan;
3512 struct regnode_charclass_class intrnl;
3513 int f = 0;
1de06328 3514
8aa23a47
YO
3515 data_fake.flags = 0;
3516 if (data) {
3517 data_fake.whilem_c = data->whilem_c;
3518 data_fake.last_closep = data->last_closep;
c277df42 3519 }
8aa23a47
YO
3520 else
3521 data_fake.last_closep = &fake;
58e23c8d 3522 data_fake.pos_delta = delta;
8aa23a47
YO
3523 if ( flags & SCF_DO_STCLASS && !scan->flags
3524 && OP(scan) == IFMATCH ) { /* Lookahead */
3525 cl_init(pRExC_state, &intrnl);
3526 data_fake.start_class = &intrnl;
3527 f |= SCF_DO_STCLASS_AND;
3528 }
3529 if (flags & SCF_WHILEM_VISITED_POS)
3530 f |= SCF_WHILEM_VISITED_POS;
3531 next = regnext(scan);
3532 nscan = NEXTOPER(NEXTOPER(scan));
3533 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3534 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3535 if (scan->flags) {
3536 if (deltanext) {
58e23c8d 3537 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3538 }
3539 else if (minnext > (I32)U8_MAX) {
58e23c8d 3540 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3541 }
3542 scan->flags = (U8)minnext;
3543 }
3544 if (data) {
3545 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3546 pars++;
3547 if (data_fake.flags & SF_HAS_EVAL)
3548 data->flags |= SF_HAS_EVAL;
3549 data->whilem_c = data_fake.whilem_c;
3550 }
3551 if (f & SCF_DO_STCLASS_AND) {
3552 const int was = (data->start_class->flags & ANYOF_EOS);
3553
3554 cl_and(data->start_class, &intrnl);
3555 if (was)
3556 data->start_class->flags |= ANYOF_EOS;
3557 }
cb434fcc 3558 }
8aa23a47
YO
3559#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3560 else {
3561 /* Positive Lookahead/lookbehind
3562 In this case we can do fixed string optimisation,
3563 but we must be careful about it. Note in the case of
3564 lookbehind the positions will be offset by the minimum
3565 length of the pattern, something we won't know about
3566 until after the recurse.
3567 */
3568 I32 deltanext, fake = 0;
3569 regnode *nscan;
3570 struct regnode_charclass_class intrnl;
3571 int f = 0;
3572 /* We use SAVEFREEPV so that when the full compile
3573 is finished perl will clean up the allocated
3574 minlens when its all done. This was we don't
3575 have to worry about freeing them when we know
3576 they wont be used, which would be a pain.
3577 */
3578 I32 *minnextp;
3579 Newx( minnextp, 1, I32 );
3580 SAVEFREEPV(minnextp);
3581
3582 if (data) {
3583 StructCopy(data, &data_fake, scan_data_t);
3584 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3585 f |= SCF_DO_SUBSTR;
3586 if (scan->flags)
304ee84b 3587 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3588 data_fake.last_found=newSVsv(data->last_found);
3589 }
3590 }
3591 else
3592 data_fake.last_closep = &fake;
3593 data_fake.flags = 0;
58e23c8d 3594 data_fake.pos_delta = delta;
8aa23a47
YO
3595 if (is_inf)
3596 data_fake.flags |= SF_IS_INF;
3597 if ( flags & SCF_DO_STCLASS && !scan->flags
3598 && OP(scan) == IFMATCH ) { /* Lookahead */
3599 cl_init(pRExC_state, &intrnl);
3600 data_fake.start_class = &intrnl;
3601 f |= SCF_DO_STCLASS_AND;
3602 }
3603 if (flags & SCF_WHILEM_VISITED_POS)
3604 f |= SCF_WHILEM_VISITED_POS;
3605 next = regnext(scan);
3606 nscan = NEXTOPER(NEXTOPER(scan));
3607
3608 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3609 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3610 if (scan->flags) {
3611 if (deltanext) {
58e23c8d 3612 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3613 }
3614 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3615 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3616 }
3617 scan->flags = (U8)*minnextp;
3618 }
3619
3620 *minnextp += min;
3621
3622 if (f & SCF_DO_STCLASS_AND) {
3623 const int was = (data->start_class->flags & ANYOF_EOS);
3624
3625 cl_and(data->start_class, &intrnl);
3626 if (was)
3627 data->start_class->flags |= ANYOF_EOS;
3628 }
3629 if (data) {
3630 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3631 pars++;
3632 if (data_fake.flags & SF_HAS_EVAL)
3633 data->flags |= SF_HAS_EVAL;
3634 data->whilem_c = data_fake.whilem_c;
3635 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3636 if (RExC_rx->minlen<*minnextp)
3637 RExC_rx->minlen=*minnextp;
304ee84b 3638 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3639 SvREFCNT_dec(data_fake.last_found);
3640
3641 if ( data_fake.minlen_fixed != minlenp )
3642 {
3643 data->offset_fixed= data_fake.offset_fixed;
3644 data->minlen_fixed= data_fake.minlen_fixed;
3645 data->lookbehind_fixed+= scan->flags;
3646 }
3647 if ( data_fake.minlen_float != minlenp )
3648 {
3649 data->minlen_float= data_fake.minlen_float;
3650 data->offset_float_min=data_fake.offset_float_min;
3651 data->offset_float_max=data_fake.offset_float_max;
3652 data->lookbehind_float+= scan->flags;
3653 }
3654 }
3655 }
3656
3657
40d049e4 3658 }
8aa23a47
YO
3659#endif
3660 }
3661 else if (OP(scan) == OPEN) {
3662 if (stopparen != (I32)ARG(scan))
3663 pars++;
3664 }
3665 else if (OP(scan) == CLOSE) {
3666 if (stopparen == (I32)ARG(scan)) {
3667 break;
3668 }
3669 if ((I32)ARG(scan) == is_par) {
3670 next = regnext(scan);
b515a41d 3671
8aa23a47
YO
3672 if ( next && (OP(next) != WHILEM) && next < last)
3673 is_par = 0; /* Disable optimization */
40d049e4 3674 }
8aa23a47
YO
3675 if (data)
3676 *(data->last_closep) = ARG(scan);
3677 }
3678 else if (OP(scan) == EVAL) {
c277df42
IZ
3679 if (data)
3680 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3681 }
3682 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3683 if (flags & SCF_DO_SUBSTR) {
304ee84b 3684 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3685 flags &= ~SCF_DO_SUBSTR;
40d049e4 3686 }
8aa23a47
YO
3687 if (data && OP(scan)==ACCEPT) {
3688 data->flags |= SCF_SEEN_ACCEPT;
3689 if (stopmin > min)
3690 stopmin = min;
e2e6a0f1 3691 }
8aa23a47
YO
3692 }
3693 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3694 {
0f5d15d6 3695 if (flags & SCF_DO_SUBSTR) {
304ee84b 3696 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3697 data->longest = &(data->longest_float);
3698 }
3699 is_inf = is_inf_internal = 1;
653099ff 3700 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3701 cl_anything(pRExC_state, data->start_class);
96776eda 3702 flags &= ~SCF_DO_STCLASS;
8aa23a47 3703 }
58e23c8d 3704 else if (OP(scan) == GPOS) {
bbe252da 3705 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3706 !(delta || is_inf || (data && data->pos_delta)))
3707 {
bbe252da
YO
3708 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3709 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3710 if (RExC_rx->gofs < (U32)min)
3711 RExC_rx->gofs = min;
3712 } else {
bbe252da 3713 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3714 RExC_rx->gofs = 0;
3715 }
3716 }
786e8c11 3717#ifdef TRIE_STUDY_OPT
40d049e4 3718#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3719 else if (PL_regkind[OP(scan)] == TRIE) {
3720 /* NOTE - There is similar code to this block above for handling
3721 BRANCH nodes on the initial study. If you change stuff here
3722 check there too. */
3723 regnode *trie_node= scan;
3724 regnode *tail= regnext(scan);
f8fc2ecf 3725 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3726 I32 max1 = 0, min1 = I32_MAX;
3727 struct regnode_charclass_class accum;
3728
3729 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3730 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3731 if (flags & SCF_DO_STCLASS)
3732 cl_init_zero(pRExC_state, &accum);
3733
3734 if (!trie->jump) {
3735 min1= trie->minlen;
3736 max1= trie->maxlen;
3737 } else {
3738 const regnode *nextbranch= NULL;
3739 U32 word;
3740
3741 for ( word=1 ; word <= trie->wordcount ; word++)
3742 {
3743 I32 deltanext=0, minnext=0, f = 0, fake;
3744 struct regnode_charclass_class this_class;
3745
3746 data_fake.flags = 0;
3747 if (data) {
3748 data_fake.whilem_c = data->whilem_c;
3749 data_fake.last_closep = data->last_closep;
3750 }
3751 else
3752 data_fake.last_closep = &fake;
58e23c8d 3753 data_fake.pos_delta = delta;
8aa23a47
YO
3754 if (flags & SCF_DO_STCLASS) {
3755 cl_init(pRExC_state, &this_class);
3756 data_fake.start_class = &this_class;
3757 f = SCF_DO_STCLASS_AND;
3758 }
3759 if (flags & SCF_WHILEM_VISITED_POS)
3760 f |= SCF_WHILEM_VISITED_POS;
3761
3762 if (trie->jump[word]) {
3763 if (!nextbranch)
3764 nextbranch = trie_node + trie->jump[0];
3765 scan= trie_node + trie->jump[word];
3766 /* We go from the jump point to the branch that follows
3767 it. Note this means we need the vestigal unused branches
3768 even though they arent otherwise used.
3769 */
3770 minnext = study_chunk(pRExC_state, &scan, minlenp,
3771 &deltanext, (regnode *)nextbranch, &data_fake,
3772 stopparen, recursed, NULL, f,depth+1);
3773 }
3774 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3775 nextbranch= regnext((regnode*)nextbranch);
3776
3777 if (min1 > (I32)(minnext + trie->minlen))
3778 min1 = minnext + trie->minlen;
3779 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3780 max1 = minnext + deltanext + trie->maxlen;
3781 if (deltanext == I32_MAX)
3782 is_inf = is_inf_internal = 1;
3783
3784 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3785 pars++;
3786 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3787 if ( stopmin > min + min1)
3788 stopmin = min + min1;
3789 flags &= ~SCF_DO_SUBSTR;
3790 if (data)
3791 data->flags |= SCF_SEEN_ACCEPT;
3792 }
3793 if (data) {
3794 if (data_fake.flags & SF_HAS_EVAL)
3795 data->flags |= SF_HAS_EVAL;
3796 data->whilem_c = data_fake.whilem_c;
3797 }
3798 if (flags & SCF_DO_STCLASS)
3799 cl_or(pRExC_state, &accum, &this_class);
3800 }
3801 }
3802 if (flags & SCF_DO_SUBSTR) {
3803 data->pos_min += min1;
3804 data->pos_delta += max1 - min1;
3805 if (max1 != min1 || is_inf)
3806 data->longest = &(data->longest_float);
3807 }
3808 min += min1;
3809 delta += max1 - min1;
3810 if (flags & SCF_DO_STCLASS_OR) {
3811 cl_or(pRExC_state, data->start_class, &accum);
3812 if (min1) {
3813 cl_and(data->start_class, and_withp);
3814 flags &= ~SCF_DO_STCLASS;
3815 }
3816 }
3817 else if (flags & SCF_DO_STCLASS_AND) {
3818