This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Typo fix, by Robin Barker
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
1f1031fe 129
40d049e4
YO
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
3dab1dad 136#ifdef DEBUGGING
be8e71aa 137 const char *lastparse;
3dab1dad 138 I32 lastnum;
1f1031fe 139 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 142#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 143#endif
830247a4
IZ
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
f8fc2ecf 149#define RExC_rxi (pRExC_state->rxi)
fac92740 150#define RExC_start (pRExC_state->start)
830247a4
IZ
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 154#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 155#define RExC_emit (pRExC_state->emit)
fac92740 156#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
157#define RExC_naughty (pRExC_state->naughty)
158#define RExC_sawback (pRExC_state->sawback)
159#define RExC_seen (pRExC_state->seen)
160#define RExC_size (pRExC_state->size)
161#define RExC_npar (pRExC_state->npar)
e2e6a0f1 162#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
163#define RExC_extralen (pRExC_state->extralen)
164#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
165#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 166#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 167#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
168#define RExC_open_parens (pRExC_state->open_parens)
169#define RExC_close_parens (pRExC_state->close_parens)
170#define RExC_opend (pRExC_state->opend)
81714fb9 171#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
172#define RExC_recurse (pRExC_state->recurse)
173#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 174
cde0cee5 175
a687059c
LW
176#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
177#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
178 ((*s) == '{' && regcurly(s)))
a687059c 179
35c8bce7
LW
180#ifdef SPSTART
181#undef SPSTART /* dratted cpp namespace... */
182#endif
a687059c
LW
183/*
184 * Flags to be passed up and down.
185 */
a687059c 186#define WORST 0 /* Worst case. */
821b33a5 187#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
188#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
189#define SPSTART 0x4 /* Starts with * or +. */
190#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 191
3dab1dad
YO
192#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
193
07be1b83
YO
194/* whether trie related optimizations are enabled */
195#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
196#define TRIE_STUDY_OPT
786e8c11 197#define FULL_TRIE_STUDY
07be1b83
YO
198#define TRIE_STCLASS
199#endif
1de06328
YO
200
201
40d049e4
YO
202
203#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
204#define PBITVAL(paren) (1 << ((paren) & 7))
205#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
206#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
207#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
208
209
1de06328
YO
210/* About scan_data_t.
211
212 During optimisation we recurse through the regexp program performing
213 various inplace (keyhole style) optimisations. In addition study_chunk
214 and scan_commit populate this data structure with information about
215 what strings MUST appear in the pattern. We look for the longest
216 string that must appear for at a fixed location, and we look for the
217 longest string that may appear at a floating location. So for instance
218 in the pattern:
219
220 /FOO[xX]A.*B[xX]BAR/
221
222 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
223 strings (because they follow a .* construct). study_chunk will identify
224 both FOO and BAR as being the longest fixed and floating strings respectively.
225
226 The strings can be composites, for instance
227
228 /(f)(o)(o)/
229
230 will result in a composite fixed substring 'foo'.
231
232 For each string some basic information is maintained:
233
234 - offset or min_offset
235 This is the position the string must appear at, or not before.
236 It also implicitly (when combined with minlenp) tells us how many
237 character must match before the string we are searching.
238 Likewise when combined with minlenp and the length of the string
239 tells us how many characters must appear after the string we have
240 found.
241
242 - max_offset
243 Only used for floating strings. This is the rightmost point that
244 the string can appear at. Ifset to I32 max it indicates that the
245 string can occur infinitely far to the right.
246
247 - minlenp
248 A pointer to the minimum length of the pattern that the string
249 was found inside. This is important as in the case of positive
250 lookahead or positive lookbehind we can have multiple patterns
251 involved. Consider
252
253 /(?=FOO).*F/
254
255 The minimum length of the pattern overall is 3, the minimum length
256 of the lookahead part is 3, but the minimum length of the part that
257 will actually match is 1. So 'FOO's minimum length is 3, but the
258 minimum length for the F is 1. This is important as the minimum length
259 is used to determine offsets in front of and behind the string being
260 looked for. Since strings can be composites this is the length of the
261 pattern at the time it was commited with a scan_commit. Note that
262 the length is calculated by study_chunk, so that the minimum lengths
263 are not known until the full pattern has been compiled, thus the
264 pointer to the value.
265
266 - lookbehind
267
268 In the case of lookbehind the string being searched for can be
269 offset past the start point of the final matching string.
270 If this value was just blithely removed from the min_offset it would
271 invalidate some of the calculations for how many chars must match
272 before or after (as they are derived from min_offset and minlen and
273 the length of the string being searched for).
274 When the final pattern is compiled and the data is moved from the
275 scan_data_t structure into the regexp structure the information
276 about lookbehind is factored in, with the information that would
277 have been lost precalculated in the end_shift field for the
278 associated string.
279
280 The fields pos_min and pos_delta are used to store the minimum offset
281 and the delta to the maximum offset at the current point in the pattern.
282
283*/
2c2d71f5
JH
284
285typedef struct scan_data_t {
1de06328
YO
286 /*I32 len_min; unused */
287 /*I32 len_delta; unused */
2c2d71f5
JH
288 I32 pos_min;
289 I32 pos_delta;
290 SV *last_found;
1de06328 291 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
292 I32 last_start_min;
293 I32 last_start_max;
1de06328
YO
294 SV **longest; /* Either &l_fixed, or &l_float. */
295 SV *longest_fixed; /* longest fixed string found in pattern */
296 I32 offset_fixed; /* offset where it starts */
297 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
299 SV *longest_float; /* longest floating string found in pattern */
300 I32 offset_float_min; /* earliest point in string it can appear */
301 I32 offset_float_max; /* latest point in string it can appear */
302 I32 *minlen_float; /* pointer to the minlen relevent to the string */
303 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
304 I32 flags;
305 I32 whilem_c;
cb434fcc 306 I32 *last_closep;
653099ff 307 struct regnode_charclass_class *start_class;
2c2d71f5
JH
308} scan_data_t;
309
a687059c 310/*
e50aee73 311 * Forward declarations for pregcomp()'s friends.
a687059c 312 */
a0d0e21e 313
27da23d5 314static const scan_data_t zero_scan_data =
1de06328 315 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
316
317#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
318#define SF_BEFORE_SEOL 0x0001
319#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
320#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
321#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
322
09b7f37c
CB
323#ifdef NO_UNARY_PLUS
324# define SF_FIX_SHIFT_EOL (0+2)
325# define SF_FL_SHIFT_EOL (0+4)
326#else
327# define SF_FIX_SHIFT_EOL (+2)
328# define SF_FL_SHIFT_EOL (+4)
329#endif
c277df42
IZ
330
331#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
332#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
333
334#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
335#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
336#define SF_IS_INF 0x0040
337#define SF_HAS_PAR 0x0080
338#define SF_IN_PAR 0x0100
339#define SF_HAS_EVAL 0x0200
340#define SCF_DO_SUBSTR 0x0400
653099ff
GS
341#define SCF_DO_STCLASS_AND 0x0800
342#define SCF_DO_STCLASS_OR 0x1000
343#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 344#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 345
786e8c11 346#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 347#define SCF_SEEN_ACCEPT 0x8000
07be1b83 348
eb160463 349#define UTF (RExC_utf8 != 0)
bbe252da
YO
350#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
351#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 352
ffc61ed2 353#define OOB_UNICODE 12345678
93733859 354#define OOB_NAMEDCLASS -1
b8c5462f 355
a0ed51b3
LW
356#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
357#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
358
8615cb43 359
b45f050a
JF
360/* length of regex to show in messages that don't mark a position within */
361#define RegexLengthToShowInErrorMessages 127
362
363/*
364 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
365 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
366 * op/pragma/warn/regcomp.
367 */
7253e4e3
RK
368#define MARKER1 "<-- HERE" /* marker as it appears in the description */
369#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 370
7253e4e3 371#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
372
373/*
374 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
375 * arg. Show regex, up to a maximum length. If it's too long, chop and add
376 * "...".
377 */
58e23c8d 378#define _FAIL(code) STMT_START { \
bfed75c6 379 const char *ellipses = ""; \
ccb2c380
MP
380 IV len = RExC_end - RExC_precomp; \
381 \
382 if (!SIZE_ONLY) \
383 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
384 if (len > RegexLengthToShowInErrorMessages) { \
385 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
386 len = RegexLengthToShowInErrorMessages - 10; \
387 ellipses = "..."; \
388 } \
58e23c8d 389 code; \
ccb2c380 390} STMT_END
8615cb43 391
58e23c8d
YO
392#define FAIL(msg) _FAIL( \
393 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
394 msg, (int)len, RExC_precomp, ellipses))
395
396#define FAIL2(msg,arg) _FAIL( \
397 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
398 arg, (int)len, RExC_precomp, ellipses))
399
b45f050a 400/*
b45f050a
JF
401 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
402 */
ccb2c380 403#define Simple_vFAIL(m) STMT_START { \
a28509cc 404 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
405 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
406 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
407} STMT_END
b45f050a
JF
408
409/*
410 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
411 */
ccb2c380
MP
412#define vFAIL(m) STMT_START { \
413 if (!SIZE_ONLY) \
414 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
415 Simple_vFAIL(m); \
416} STMT_END
b45f050a
JF
417
418/*
419 * Like Simple_vFAIL(), but accepts two arguments.
420 */
ccb2c380 421#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 422 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
423 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
424 (int)offset, RExC_precomp, RExC_precomp + offset); \
425} STMT_END
b45f050a
JF
426
427/*
428 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
429 */
ccb2c380
MP
430#define vFAIL2(m,a1) STMT_START { \
431 if (!SIZE_ONLY) \
432 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
433 Simple_vFAIL2(m, a1); \
434} STMT_END
b45f050a
JF
435
436
437/*
438 * Like Simple_vFAIL(), but accepts three arguments.
439 */
ccb2c380 440#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 441 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
442 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
443 (int)offset, RExC_precomp, RExC_precomp + offset); \
444} STMT_END
b45f050a
JF
445
446/*
447 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
448 */
ccb2c380
MP
449#define vFAIL3(m,a1,a2) STMT_START { \
450 if (!SIZE_ONLY) \
451 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
452 Simple_vFAIL3(m, a1, a2); \
453} STMT_END
b45f050a
JF
454
455/*
456 * Like Simple_vFAIL(), but accepts four arguments.
457 */
ccb2c380 458#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 459 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
460 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
461 (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
b45f050a 463
ccb2c380 464#define vWARN(loc,m) STMT_START { \
a28509cc 465 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
467 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
469
470#define vWARNdep(loc,m) STMT_START { \
a28509cc 471 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
472 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
473 "%s" REPORT_LOCATION, \
474 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
475} STMT_END
476
477
478#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
489
490#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 491 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
492 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
493 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
494} STMT_END
495
496#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 497 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
498 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
499 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
500} STMT_END
9d1d55b5 501
8615cb43 502
cd439c50 503/* Allow for side effects in s */
ccb2c380
MP
504#define REGC(c,s) STMT_START { \
505 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
506} STMT_END
cd439c50 507
fac92740
MJD
508/* Macros for recording node offsets. 20001227 mjd@plover.com
509 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
510 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
511 * Element 0 holds the number n.
07be1b83 512 * Position is 1 indexed.
fac92740
MJD
513 */
514
ccb2c380
MP
515#define Set_Node_Offset_To_R(node,byte) STMT_START { \
516 if (! SIZE_ONLY) { \
517 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 518 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 519 if((node) < 0) { \
551405c4 520 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
521 } else { \
522 RExC_offsets[2*(node)-1] = (byte); \
523 } \
524 } \
525} STMT_END
526
527#define Set_Node_Offset(node,byte) \
528 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
529#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
530
531#define Set_Node_Length_To_R(node,len) STMT_START { \
532 if (! SIZE_ONLY) { \
533 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 534 __LINE__, (int)(node), (int)(len))); \
ccb2c380 535 if((node) < 0) { \
551405c4 536 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
537 } else { \
538 RExC_offsets[2*(node)] = (len); \
539 } \
540 } \
541} STMT_END
542
543#define Set_Node_Length(node,len) \
544 Set_Node_Length_To_R((node)-RExC_emit_start, len)
545#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
546#define Set_Node_Cur_Length(node) \
547 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
548
549/* Get offsets and lengths */
550#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
551#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
552
07be1b83
YO
553#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
554 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
555 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
556} STMT_END
557
558
559#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
560#define EXPERIMENTAL_INPLACESCAN
561#endif
562
304ee84b
YO
563#define DEBUG_STUDYDATA(str,data,depth) \
564DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 565 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
566 "%*s" str "Pos:%"IVdf"/%"IVdf \
567 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
568 (int)(depth)*2, "", \
569 (IV)((data)->pos_min), \
570 (IV)((data)->pos_delta), \
304ee84b 571 (UV)((data)->flags), \
1de06328 572 (IV)((data)->whilem_c), \
304ee84b
YO
573 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
574 is_inf ? "INF " : "" \
1de06328
YO
575 ); \
576 if ((data)->last_found) \
577 PerlIO_printf(Perl_debug_log, \
578 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
579 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
580 SvPVX_const((data)->last_found), \
581 (IV)((data)->last_end), \
582 (IV)((data)->last_start_min), \
583 (IV)((data)->last_start_max), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
586 SvPVX_const((data)->longest_fixed), \
587 (IV)((data)->offset_fixed), \
588 ((data)->longest && \
589 (data)->longest==&((data)->longest_float)) ? "*" : "", \
590 SvPVX_const((data)->longest_float), \
591 (IV)((data)->offset_float_min), \
592 (IV)((data)->offset_float_max) \
593 ); \
594 PerlIO_printf(Perl_debug_log,"\n"); \
595});
596
acfe0abc 597static void clear_re(pTHX_ void *r);
4327152a 598
653099ff 599/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 600 Update the longest found anchored substring and the longest found
653099ff
GS
601 floating substrings if needed. */
602
4327152a 603STATIC void
304ee84b 604S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 605{
e1ec3a88
AL
606 const STRLEN l = CHR_SVLEN(data->last_found);
607 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 608 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 609
c277df42 610 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 611 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
612 if (*data->longest == data->longest_fixed) {
613 data->offset_fixed = l ? data->last_start_min : data->pos_min;
614 if (data->flags & SF_BEFORE_EOL)
b81d288d 615 data->flags
c277df42
IZ
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
617 else
618 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
619 data->minlen_fixed=minlenp;
620 data->lookbehind_fixed=0;
a0ed51b3 621 }
304ee84b 622 else { /* *data->longest == data->longest_float */
c277df42 623 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
624 data->offset_float_max = (l
625 ? data->last_start_max
c277df42 626 : data->pos_min + data->pos_delta);
304ee84b 627 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 628 data->offset_float_max = I32_MAX;
c277df42 629 if (data->flags & SF_BEFORE_EOL)
b81d288d 630 data->flags
c277df42
IZ
631 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
632 else
633 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
634 data->minlen_float=minlenp;
635 data->lookbehind_float=0;
c277df42
IZ
636 }
637 }
638 SvCUR_set(data->last_found, 0);
0eda9292 639 {
a28509cc 640 SV * const sv = data->last_found;
097eb12c
AL
641 if (SvUTF8(sv) && SvMAGICAL(sv)) {
642 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
643 if (mg)
644 mg->mg_len = 0;
645 }
0eda9292 646 }
c277df42
IZ
647 data->last_end = -1;
648 data->flags &= ~SF_BEFORE_EOL;
304ee84b 649 DEBUG_STUDYDATA("cl_anything: ",data,0);
c277df42
IZ
650}
651
653099ff
GS
652/* Can match anything (initialization) */
653STATIC void
097eb12c 654S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 655{
653099ff 656 ANYOF_CLASS_ZERO(cl);
f8bef550 657 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 658 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
659 if (LOC)
660 cl->flags |= ANYOF_LOCALE;
661}
662
663/* Can match anything (initialization) */
664STATIC int
5f66b61c 665S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
666{
667 int value;
668
aaa51d5e 669 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
670 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
671 return 1;
1aa99e6b
IH
672 if (!(cl->flags & ANYOF_UNICODE_ALL))
673 return 0;
10edeb5d 674 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 675 return 0;
653099ff
GS
676 return 1;
677}
678
679/* Can match anything (initialization) */
680STATIC void
097eb12c 681S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 682{
8ecf7187 683 Zero(cl, 1, struct regnode_charclass_class);
653099ff 684 cl->type = ANYOF;
830247a4 685 cl_anything(pRExC_state, cl);
653099ff
GS
686}
687
688STATIC void
097eb12c 689S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 690{
8ecf7187 691 Zero(cl, 1, struct regnode_charclass_class);
653099ff 692 cl->type = ANYOF;
830247a4 693 cl_anything(pRExC_state, cl);
653099ff
GS
694 if (LOC)
695 cl->flags |= ANYOF_LOCALE;
696}
697
698/* 'And' a given class with another one. Can create false positives */
699/* We assume that cl is not inverted */
700STATIC void
5f66b61c 701S_cl_and(struct regnode_charclass_class *cl,
a28509cc 702 const struct regnode_charclass_class *and_with)
653099ff 703{
40d049e4
YO
704
705 assert(and_with->type == ANYOF);
653099ff
GS
706 if (!(and_with->flags & ANYOF_CLASS)
707 && !(cl->flags & ANYOF_CLASS)
708 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
709 && !(and_with->flags & ANYOF_FOLD)
710 && !(cl->flags & ANYOF_FOLD)) {
711 int i;
712
713 if (and_with->flags & ANYOF_INVERT)
714 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
715 cl->bitmap[i] &= ~and_with->bitmap[i];
716 else
717 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
718 cl->bitmap[i] &= and_with->bitmap[i];
719 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
720 if (!(and_with->flags & ANYOF_EOS))
721 cl->flags &= ~ANYOF_EOS;
1aa99e6b 722
14ebb1a2
JH
723 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
724 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
725 cl->flags &= ~ANYOF_UNICODE_ALL;
726 cl->flags |= ANYOF_UNICODE;
727 ARG_SET(cl, ARG(and_with));
728 }
14ebb1a2
JH
729 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
730 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 731 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
732 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
733 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 734 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
735}
736
737/* 'OR' a given class with another one. Can create false positives */
738/* We assume that cl is not inverted */
739STATIC void
097eb12c 740S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 741{
653099ff
GS
742 if (or_with->flags & ANYOF_INVERT) {
743 /* We do not use
744 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
745 * <= (B1 | !B2) | (CL1 | !CL2)
746 * which is wasteful if CL2 is small, but we ignore CL2:
747 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
748 * XXXX Can we handle case-fold? Unclear:
749 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
750 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
751 */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
753 && !(or_with->flags & ANYOF_FOLD)
754 && !(cl->flags & ANYOF_FOLD) ) {
755 int i;
756
757 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
758 cl->bitmap[i] |= ~or_with->bitmap[i];
759 } /* XXXX: logic is complicated otherwise */
760 else {
830247a4 761 cl_anything(pRExC_state, cl);
653099ff
GS
762 }
763 } else {
764 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
765 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 766 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
767 || (cl->flags & ANYOF_FOLD)) ) {
768 int i;
769
770 /* OR char bitmap and class bitmap separately */
771 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
772 cl->bitmap[i] |= or_with->bitmap[i];
773 if (or_with->flags & ANYOF_CLASS) {
774 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
775 cl->classflags[i] |= or_with->classflags[i];
776 cl->flags |= ANYOF_CLASS;
777 }
778 }
779 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 780 cl_anything(pRExC_state, cl);
653099ff
GS
781 }
782 }
783 if (or_with->flags & ANYOF_EOS)
784 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
785
786 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
787 ARG(cl) != ARG(or_with)) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
791 if (or_with->flags & ANYOF_UNICODE_ALL) {
792 cl->flags |= ANYOF_UNICODE_ALL;
793 cl->flags &= ~ANYOF_UNICODE;
794 }
653099ff
GS
795}
796
a3621e74
YO
797#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
798#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
799#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
800#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
801
3dab1dad
YO
802
803#ifdef DEBUGGING
07be1b83 804/*
2b8b4781
NC
805 dump_trie(trie,widecharmap,revcharmap)
806 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
807 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
808
809 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
810 The _interim_ variants are used for debugging the interim
811 tables that are used to generate the final compressed
812 representation which is what dump_trie expects.
813
3dab1dad
YO
814 Part of the reason for their existance is to provide a form
815 of documentation as to how the different representations function.
07be1b83
YO
816
817*/
3dab1dad
YO
818
819/*
3dab1dad
YO
820 Dumps the final compressed table form of the trie to Perl_debug_log.
821 Used for debugging make_trie().
822*/
823
824STATIC void
2b8b4781
NC
825S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
826 AV *revcharmap, U32 depth)
3dab1dad
YO
827{
828 U32 state;
ab3bbdeb 829 SV *sv=sv_newmortal();
55eed653 830 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
831 GET_RE_DEBUG_FLAGS_DECL;
832
ab3bbdeb 833
3dab1dad
YO
834 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
835 (int)depth * 2 + 2,"",
836 "Match","Base","Ofs" );
837
838 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 839 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 840 if ( tmp ) {
ab3bbdeb
YO
841 PerlIO_printf( Perl_debug_log, "%*s",
842 colwidth,
ddc5bc0f 843 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
844 PL_colors[0], PL_colors[1],
845 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
846 PERL_PV_ESCAPE_FIRSTCHAR
847 )
848 );
3dab1dad
YO
849 }
850 }
851 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
852 (int)depth * 2 + 2,"");
853
854 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 855 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
856 PerlIO_printf( Perl_debug_log, "\n");
857
1e2e3d02 858 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 859 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
860
861 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
862
863 if ( trie->states[ state ].wordnum ) {
864 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
865 } else {
866 PerlIO_printf( Perl_debug_log, "%6s", "" );
867 }
868
869 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
870
871 if ( base ) {
872 U32 ofs = 0;
873
874 while( ( base + ofs < trie->uniquecharcount ) ||
875 ( base + ofs - trie->uniquecharcount < trie->lasttrans
876 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
877 ofs++;
878
879 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
880
881 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
882 if ( ( base + ofs >= trie->uniquecharcount ) &&
883 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
884 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
885 {
ab3bbdeb
YO
886 PerlIO_printf( Perl_debug_log, "%*"UVXf,
887 colwidth,
3dab1dad
YO
888 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
889 } else {
ab3bbdeb 890 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
891 }
892 }
893
894 PerlIO_printf( Perl_debug_log, "]");
895
896 }
897 PerlIO_printf( Perl_debug_log, "\n" );
898 }
899}
900/*
3dab1dad
YO
901 Dumps a fully constructed but uncompressed trie in list form.
902 List tries normally only are used for construction when the number of
903 possible chars (trie->uniquecharcount) is very high.
904 Used for debugging make_trie().
905*/
906STATIC void
55eed653 907S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
908 HV *widecharmap, AV *revcharmap, U32 next_alloc,
909 U32 depth)
3dab1dad
YO
910{
911 U32 state;
ab3bbdeb 912 SV *sv=sv_newmortal();
55eed653 913 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
914 GET_RE_DEBUG_FLAGS_DECL;
915 /* print out the table precompression. */
ab3bbdeb
YO
916 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
917 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
918 "------:-----+-----------------\n" );
3dab1dad
YO
919
920 for( state=1 ; state < next_alloc ; state ++ ) {
921 U16 charid;
922
ab3bbdeb 923 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
924 (int)depth * 2 + 2,"", (UV)state );
925 if ( ! trie->states[ state ].wordnum ) {
926 PerlIO_printf( Perl_debug_log, "%5s| ","");
927 } else {
928 PerlIO_printf( Perl_debug_log, "W%4x| ",
929 trie->states[ state ].wordnum
930 );
931 }
932 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 933 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
934 if ( tmp ) {
935 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
936 colwidth,
ddc5bc0f 937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
941 ) ,
1e2e3d02
YO
942 TRIE_LIST_ITEM(state,charid).forid,
943 (UV)TRIE_LIST_ITEM(state,charid).newstate
944 );
945 if (!(charid % 10))
664e119d
RGS
946 PerlIO_printf(Perl_debug_log, "\n%*s| ",
947 (int)((depth * 2) + 14), "");
1e2e3d02 948 }
ab3bbdeb
YO
949 }
950 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
951 }
952}
953
954/*
3dab1dad
YO
955 Dumps a fully constructed but uncompressed trie in table form.
956 This is the normal DFA style state transition table, with a few
957 twists to facilitate compression later.
958 Used for debugging make_trie().
959*/
960STATIC void
55eed653 961S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
962 HV *widecharmap, AV *revcharmap, U32 next_alloc,
963 U32 depth)
3dab1dad
YO
964{
965 U32 state;
966 U16 charid;
ab3bbdeb 967 SV *sv=sv_newmortal();
55eed653 968 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
969 GET_RE_DEBUG_FLAGS_DECL;
970
971 /*
972 print out the table precompression so that we can do a visual check
973 that they are identical.
974 */
975
976 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977
978 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 979 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 980 if ( tmp ) {
ab3bbdeb
YO
981 PerlIO_printf( Perl_debug_log, "%*s",
982 colwidth,
ddc5bc0f 983 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
984 PL_colors[0], PL_colors[1],
985 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
986 PERL_PV_ESCAPE_FIRSTCHAR
987 )
988 );
3dab1dad
YO
989 }
990 }
991
992 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
993
994 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 995 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
996 }
997
998 PerlIO_printf( Perl_debug_log, "\n" );
999
1000 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1001
1002 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1003 (int)depth * 2 + 2,"",
1004 (UV)TRIE_NODENUM( state ) );
1005
1006 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1007 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1008 if (v)
1009 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1010 else
1011 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1012 }
1013 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1014 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1015 } else {
1016 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1017 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1018 }
1019 }
07be1b83 1020}
3dab1dad
YO
1021
1022#endif
1023
786e8c11
YO
1024/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1025 startbranch: the first branch in the whole branch sequence
1026 first : start branch of sequence of branch-exact nodes.
1027 May be the same as startbranch
1028 last : Thing following the last branch.
1029 May be the same as tail.
1030 tail : item following the branch sequence
1031 count : words in the sequence
1032 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1033 depth : indent depth
3dab1dad 1034
786e8c11 1035Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1036
786e8c11
YO
1037A trie is an N'ary tree where the branches are determined by digital
1038decomposition of the key. IE, at the root node you look up the 1st character and
1039follow that branch repeat until you find the end of the branches. Nodes can be
1040marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1041
786e8c11 1042 /he|she|his|hers/
72f13be8 1043
786e8c11
YO
1044would convert into the following structure. Numbers represent states, letters
1045following numbers represent valid transitions on the letter from that state, if
1046the number is in square brackets it represents an accepting state, otherwise it
1047will be in parenthesis.
07be1b83 1048
786e8c11
YO
1049 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1050 | |
1051 | (2)
1052 | |
1053 (1) +-i->(6)-+-s->[7]
1054 |
1055 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1056
786e8c11
YO
1057 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1058
1059This shows that when matching against the string 'hers' we will begin at state 1
1060read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1061then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1062is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1063single traverse. We store a mapping from accepting to state to which word was
1064matched, and then when we have multiple possibilities we try to complete the
1065rest of the regex in the order in which they occured in the alternation.
1066
1067The only prior NFA like behaviour that would be changed by the TRIE support is
1068the silent ignoring of duplicate alternations which are of the form:
1069
1070 / (DUPE|DUPE) X? (?{ ... }) Y /x
1071
1072Thus EVAL blocks follwing a trie may be called a different number of times with
1073and without the optimisation. With the optimisations dupes will be silently
1074ignored. This inconsistant behaviour of EVAL type nodes is well established as
1075the following demonstrates:
1076
1077 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1078
1079which prints out 'word' three times, but
1080
1081 'words'=~/(word|word|word)(?{ print $1 })S/
1082
1083which doesnt print it out at all. This is due to other optimisations kicking in.
1084
1085Example of what happens on a structural level:
1086
1087The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1088
1089 1: CURLYM[1] {1,32767}(18)
1090 5: BRANCH(8)
1091 6: EXACT <ac>(16)
1092 8: BRANCH(11)
1093 9: EXACT <ad>(16)
1094 11: BRANCH(14)
1095 12: EXACT <ab>(16)
1096 16: SUCCEED(0)
1097 17: NOTHING(18)
1098 18: END(0)
1099
1100This would be optimizable with startbranch=5, first=5, last=16, tail=16
1101and should turn into:
1102
1103 1: CURLYM[1] {1,32767}(18)
1104 5: TRIE(16)
1105 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1106 <ac>
1107 <ad>
1108 <ab>
1109 16: SUCCEED(0)
1110 17: NOTHING(18)
1111 18: END(0)
1112
1113Cases where tail != last would be like /(?foo|bar)baz/:
1114
1115 1: BRANCH(4)
1116 2: EXACT <foo>(8)
1117 4: BRANCH(7)
1118 5: EXACT <bar>(8)
1119 7: TAIL(8)
1120 8: EXACT <baz>(10)
1121 10: END(0)
1122
1123which would be optimizable with startbranch=1, first=1, last=7, tail=8
1124and would end up looking like:
1125
1126 1: TRIE(8)
1127 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1128 <foo>
1129 <bar>
1130 7: TAIL(8)
1131 8: EXACT <baz>(10)
1132 10: END(0)
1133
1134 d = uvuni_to_utf8_flags(d, uv, 0);
1135
1136is the recommended Unicode-aware way of saying
1137
1138 *(d++) = uv;
1139*/
1140
1e2e3d02 1141#define TRIE_STORE_REVCHAR \
786e8c11 1142 STMT_START { \
1e2e3d02 1143 SV *tmp = newSVpvs(""); \
786e8c11 1144 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1145 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1146 av_push( revcharmap, tmp ); \
786e8c11
YO
1147 } STMT_END
1148
1149#define TRIE_READ_CHAR STMT_START { \
1150 wordlen++; \
1151 if ( UTF ) { \
1152 if ( folder ) { \
1153 if ( foldlen > 0 ) { \
1154 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1155 foldlen -= len; \
1156 scan += len; \
1157 len = 0; \
1158 } else { \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1161 foldlen -= UNISKIP( uvc ); \
1162 scan = foldbuf + UNISKIP( uvc ); \
1163 } \
1164 } else { \
1165 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1166 } \
1167 } else { \
1168 uvc = (U32)*uc; \
1169 len = 1; \
1170 } \
1171} STMT_END
1172
1173
1174
1175#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1176 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1177 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1178 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1179 } \
1180 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1181 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1182 TRIE_LIST_CUR( state )++; \
1183} STMT_END
07be1b83 1184
786e8c11
YO
1185#define TRIE_LIST_NEW(state) STMT_START { \
1186 Newxz( trie->states[ state ].trans.list, \
1187 4, reg_trie_trans_le ); \
1188 TRIE_LIST_CUR( state ) = 1; \
1189 TRIE_LIST_LEN( state ) = 4; \
1190} STMT_END
07be1b83 1191
786e8c11
YO
1192#define TRIE_HANDLE_WORD(state) STMT_START { \
1193 U16 dupe= trie->states[ state ].wordnum; \
1194 regnode * const noper_next = regnext( noper ); \
1195 \
1196 if (trie->wordlen) \
1197 trie->wordlen[ curword ] = wordlen; \
1198 DEBUG_r({ \
1199 /* store the word for dumping */ \
1200 SV* tmp; \
1201 if (OP(noper) != NOTHING) \
1202 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1203 else \
1204 tmp = newSVpvn( "", 0 ); \
1205 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1206 av_push( trie_words, tmp ); \
786e8c11
YO
1207 }); \
1208 \
1209 curword++; \
1210 \
1211 if ( noper_next < tail ) { \
1212 if (!trie->jump) \
c944940b 1213 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1214 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1215 if (!jumper) \
1216 jumper = noper_next; \
1217 if (!nextbranch) \
1218 nextbranch= regnext(cur); \
1219 } \
1220 \
1221 if ( dupe ) { \
1222 /* So it's a dupe. This means we need to maintain a */\
1223 /* linked-list from the first to the next. */\
1224 /* we only allocate the nextword buffer when there */\
1225 /* a dupe, so first time we have to do the allocation */\
1226 if (!trie->nextword) \
c944940b 1227 trie->nextword = (U16 *) \
446bd890 1228 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1229 while ( trie->nextword[dupe] ) \
1230 dupe= trie->nextword[dupe]; \
1231 trie->nextword[dupe]= curword; \
1232 } else { \
1233 /* we haven't inserted this word yet. */ \
1234 trie->states[ state ].wordnum = curword; \
1235 } \
1236} STMT_END
07be1b83 1237
3dab1dad 1238
786e8c11
YO
1239#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1240 ( ( base + charid >= ucharcount \
1241 && base + charid < ubound \
1242 && state == trie->trans[ base - ucharcount + charid ].check \
1243 && trie->trans[ base - ucharcount + charid ].next ) \
1244 ? trie->trans[ base - ucharcount + charid ].next \
1245 : ( state==1 ? special : 0 ) \
1246 )
3dab1dad 1247
786e8c11
YO
1248#define MADE_TRIE 1
1249#define MADE_JUMP_TRIE 2
1250#define MADE_EXACT_TRIE 4
3dab1dad 1251
a3621e74 1252STATIC I32
786e8c11 1253S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1254{
27da23d5 1255 dVAR;
a3621e74
YO
1256 /* first pass, loop through and scan words */
1257 reg_trie_data *trie;
55eed653 1258 HV *widecharmap = NULL;
2b8b4781 1259 AV *revcharmap = newAV();
a3621e74 1260 regnode *cur;
9f7f3913 1261 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1262 STRLEN len = 0;
1263 UV uvc = 0;
1264 U16 curword = 0;
1265 U32 next_alloc = 0;
786e8c11
YO
1266 regnode *jumper = NULL;
1267 regnode *nextbranch = NULL;
7f69552c 1268 regnode *convert = NULL;
a3621e74 1269 /* we just use folder as a flag in utf8 */
e1ec3a88 1270 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1271 ? PL_fold
1272 : ( flags == EXACTFL
1273 ? PL_fold_locale
1274 : NULL
1275 )
1276 );
1277
2b8b4781
NC
1278#ifdef DEBUGGING
1279 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1280 AV *trie_words = NULL;
1281 /* along with revcharmap, this only used during construction but both are
1282 * useful during debugging so we store them in the struct when debugging.
8e11feef 1283 */
2b8b4781
NC
1284#else
1285 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1286 STRLEN trie_charcount=0;
3dab1dad 1287#endif
2b8b4781 1288 SV *re_trie_maxbuff;
a3621e74 1289 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1290#ifndef DEBUGGING
1291 PERL_UNUSED_ARG(depth);
1292#endif
a3621e74 1293
c944940b 1294 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1295 trie->refcount = 1;
3dab1dad 1296 trie->startstate = 1;
786e8c11 1297 trie->wordcount = word_count;
f8fc2ecf 1298 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1299 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1300 if (!(UTF && folder))
c944940b 1301 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1302 DEBUG_r({
2b8b4781 1303 trie_words = newAV();
a3621e74 1304 });
a3621e74 1305
0111c4fd 1306 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1307 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1308 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1309 }
3dab1dad
YO
1310 DEBUG_OPTIMISE_r({
1311 PerlIO_printf( Perl_debug_log,
786e8c11 1312 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1313 (int)depth * 2 + 2, "",
1314 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1315 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1316 (int)depth);
3dab1dad 1317 });
7f69552c
YO
1318
1319 /* Find the node we are going to overwrite */
1320 if ( first == startbranch && OP( last ) != BRANCH ) {
1321 /* whole branch chain */
1322 convert = first;
1323 } else {
1324 /* branch sub-chain */
1325 convert = NEXTOPER( first );
1326 }
1327
a3621e74
YO
1328 /* -- First loop and Setup --
1329
1330 We first traverse the branches and scan each word to determine if it
1331 contains widechars, and how many unique chars there are, this is
1332 important as we have to build a table with at least as many columns as we
1333 have unique chars.
1334
1335 We use an array of integers to represent the character codes 0..255
1336 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1337 native representation of the character value as the key and IV's for the
1338 coded index.
1339
1340 *TODO* If we keep track of how many times each character is used we can
1341 remap the columns so that the table compression later on is more
1342 efficient in terms of memory by ensuring most common value is in the
1343 middle and the least common are on the outside. IMO this would be better
1344 than a most to least common mapping as theres a decent chance the most
1345 common letter will share a node with the least common, meaning the node
1346 will not be compressable. With a middle is most common approach the worst
1347 case is when we have the least common nodes twice.
1348
1349 */
1350
a3621e74 1351 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1352 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1353 const U8 *uc = (U8*)STRING( noper );
a28509cc 1354 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1355 STRLEN foldlen = 0;
1356 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1357 const U8 *scan = (U8*)NULL;
07be1b83 1358 U32 wordlen = 0; /* required init */
3dab1dad 1359 STRLEN chars=0;
a3621e74 1360
3dab1dad
YO
1361 if (OP(noper) == NOTHING) {
1362 trie->minlen= 0;
1363 continue;
1364 }
1365 if (trie->bitmap) {
1366 TRIE_BITMAP_SET(trie,*uc);
1367 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1368 }
a3621e74 1369 for ( ; uc < e ; uc += len ) {
3dab1dad 1370 TRIE_CHARCOUNT(trie)++;
a3621e74 1371 TRIE_READ_CHAR;
3dab1dad 1372 chars++;
a3621e74
YO
1373 if ( uvc < 256 ) {
1374 if ( !trie->charmap[ uvc ] ) {
1375 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1376 if ( folder )
1377 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1378 TRIE_STORE_REVCHAR;
a3621e74
YO
1379 }
1380 } else {
1381 SV** svpp;
55eed653
NC
1382 if ( !widecharmap )
1383 widecharmap = newHV();
a3621e74 1384
55eed653 1385 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1386
1387 if ( !svpp )
e4584336 1388 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1389
1390 if ( !SvTRUE( *svpp ) ) {
1391 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1392 TRIE_STORE_REVCHAR;
a3621e74
YO
1393 }
1394 }
1395 }
3dab1dad
YO
1396 if( cur == first ) {
1397 trie->minlen=chars;
1398 trie->maxlen=chars;
1399 } else if (chars < trie->minlen) {
1400 trie->minlen=chars;
1401 } else if (chars > trie->maxlen) {
1402 trie->maxlen=chars;
1403 }
1404
a3621e74
YO
1405 } /* end first pass */
1406 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1407 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1408 (int)depth * 2 + 2,"",
55eed653 1409 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1410 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1411 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1412 );
c944940b 1413 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1414
1415 /*
1416 We now know what we are dealing with in terms of unique chars and
1417 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1418 representation using a flat table will take. If it's over a reasonable
1419 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1420 conservative but potentially much slower representation using an array
1421 of lists.
1422
1423 At the end we convert both representations into the same compressed
1424 form that will be used in regexec.c for matching with. The latter
1425 is a form that cannot be used to construct with but has memory
1426 properties similar to the list form and access properties similar
1427 to the table form making it both suitable for fast searches and
1428 small enough that its feasable to store for the duration of a program.
1429
1430 See the comment in the code where the compressed table is produced
1431 inplace from the flat tabe representation for an explanation of how
1432 the compression works.
1433
1434 */
1435
1436
3dab1dad 1437 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1438 /*
1439 Second Pass -- Array Of Lists Representation
1440
1441 Each state will be represented by a list of charid:state records
1442 (reg_trie_trans_le) the first such element holds the CUR and LEN
1443 points of the allocated array. (See defines above).
1444
1445 We build the initial structure using the lists, and then convert
1446 it into the compressed table form which allows faster lookups
1447 (but cant be modified once converted).
a3621e74
YO
1448 */
1449
a3621e74
YO
1450 STRLEN transcount = 1;
1451
1e2e3d02
YO
1452 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1453 "%*sCompiling trie using list compiler\n",
1454 (int)depth * 2 + 2, ""));
446bd890 1455
c944940b
JH
1456 trie->states = (reg_trie_state *)
1457 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1458 sizeof(reg_trie_state) );
a3621e74
YO
1459 TRIE_LIST_NEW(1);
1460 next_alloc = 2;
1461
1462 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1463
c445ea15
AL
1464 regnode * const noper = NEXTOPER( cur );
1465 U8 *uc = (U8*)STRING( noper );
1466 const U8 * const e = uc + STR_LEN( noper );
1467 U32 state = 1; /* required init */
1468 U16 charid = 0; /* sanity init */
1469 U8 *scan = (U8*)NULL; /* sanity init */
1470 STRLEN foldlen = 0; /* required init */
07be1b83 1471 U32 wordlen = 0; /* required init */
c445ea15
AL
1472 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1473
3dab1dad 1474 if (OP(noper) != NOTHING) {
786e8c11 1475 for ( ; uc < e ; uc += len ) {
c445ea15 1476
786e8c11 1477 TRIE_READ_CHAR;
c445ea15 1478
786e8c11
YO
1479 if ( uvc < 256 ) {
1480 charid = trie->charmap[ uvc ];
c445ea15 1481 } else {
55eed653 1482 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1483 if ( !svpp ) {
1484 charid = 0;
1485 } else {
1486 charid=(U16)SvIV( *svpp );
1487 }
c445ea15 1488 }
786e8c11
YO
1489 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1490 if ( charid ) {
a3621e74 1491
786e8c11
YO
1492 U16 check;
1493 U32 newstate = 0;
a3621e74 1494
786e8c11
YO
1495 charid--;
1496 if ( !trie->states[ state ].trans.list ) {
1497 TRIE_LIST_NEW( state );
c445ea15 1498 }
786e8c11
YO
1499 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1500 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1501 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1502 break;
1503 }
1504 }
1505 if ( ! newstate ) {
1506 newstate = next_alloc++;
1507 TRIE_LIST_PUSH( state, charid, newstate );
1508 transcount++;
1509 }
1510 state = newstate;
1511 } else {
1512 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1513 }
a28509cc 1514 }
c445ea15 1515 }
3dab1dad 1516 TRIE_HANDLE_WORD(state);
a3621e74
YO
1517
1518 } /* end second pass */
1519
1e2e3d02
YO
1520 /* next alloc is the NEXT state to be allocated */
1521 trie->statecount = next_alloc;
c944940b
JH
1522 trie->states = (reg_trie_state *)
1523 PerlMemShared_realloc( trie->states,
1524 next_alloc
1525 * sizeof(reg_trie_state) );
a3621e74 1526
3dab1dad 1527 /* and now dump it out before we compress it */
2b8b4781
NC
1528 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1529 revcharmap, next_alloc,
1530 depth+1)
1e2e3d02 1531 );
a3621e74 1532
c944940b
JH
1533 trie->trans = (reg_trie_trans *)
1534 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1535 {
1536 U32 state;
a3621e74
YO
1537 U32 tp = 0;
1538 U32 zp = 0;
1539
1540
1541 for( state=1 ; state < next_alloc ; state ++ ) {
1542 U32 base=0;
1543
1544 /*
1545 DEBUG_TRIE_COMPILE_MORE_r(
1546 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1547 );
1548 */
1549
1550 if (trie->states[state].trans.list) {
1551 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1552 U16 maxid=minid;
a28509cc 1553 U16 idx;
a3621e74
YO
1554
1555 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1556 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1557 if ( forid < minid ) {
1558 minid=forid;
1559 } else if ( forid > maxid ) {
1560 maxid=forid;
1561 }
a3621e74
YO
1562 }
1563 if ( transcount < tp + maxid - minid + 1) {
1564 transcount *= 2;
c944940b
JH
1565 trie->trans = (reg_trie_trans *)
1566 PerlMemShared_realloc( trie->trans,
446bd890
NC
1567 transcount
1568 * sizeof(reg_trie_trans) );
a3621e74
YO
1569 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1570 }
1571 base = trie->uniquecharcount + tp - minid;
1572 if ( maxid == minid ) {
1573 U32 set = 0;
1574 for ( ; zp < tp ; zp++ ) {
1575 if ( ! trie->trans[ zp ].next ) {
1576 base = trie->uniquecharcount + zp - minid;
1577 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1578 trie->trans[ zp ].check = state;
1579 set = 1;
1580 break;
1581 }
1582 }
1583 if ( !set ) {
1584 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1585 trie->trans[ tp ].check = state;
1586 tp++;
1587 zp = tp;
1588 }
1589 } else {
1590 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1591 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1592 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1593 trie->trans[ tid ].check = state;
1594 }
1595 tp += ( maxid - minid + 1 );
1596 }
1597 Safefree(trie->states[ state ].trans.list);
1598 }
1599 /*
1600 DEBUG_TRIE_COMPILE_MORE_r(
1601 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1602 );
1603 */
1604 trie->states[ state ].trans.base=base;
1605 }
cc601c31 1606 trie->lasttrans = tp + 1;
a3621e74
YO
1607 }
1608 } else {
1609 /*
1610 Second Pass -- Flat Table Representation.
1611
1612 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1613 We know that we will need Charcount+1 trans at most to store the data
1614 (one row per char at worst case) So we preallocate both structures
1615 assuming worst case.
1616
1617 We then construct the trie using only the .next slots of the entry
1618 structs.
1619
1620 We use the .check field of the first entry of the node temporarily to
1621 make compression both faster and easier by keeping track of how many non
1622 zero fields are in the node.
1623
1624 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1625 transition.
1626
1627 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1628 number representing the first entry of the node, and state as a
1629 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1630 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1631 are 2 entrys per node. eg:
1632
1633 A B A B
1634 1. 2 4 1. 3 7
1635 2. 0 3 3. 0 5
1636 3. 0 0 5. 0 0
1637 4. 0 0 7. 0 0
1638
1639 The table is internally in the right hand, idx form. However as we also
1640 have to deal with the states array which is indexed by nodenum we have to
1641 use TRIE_NODENUM() to convert.
1642
1643 */
1e2e3d02
YO
1644 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1645 "%*sCompiling trie using table compiler\n",
1646 (int)depth * 2 + 2, ""));
3dab1dad 1647
c944940b
JH
1648 trie->trans = (reg_trie_trans *)
1649 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1650 * trie->uniquecharcount + 1,
1651 sizeof(reg_trie_trans) );
1652 trie->states = (reg_trie_state *)
1653 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1654 sizeof(reg_trie_state) );
a3621e74
YO
1655 next_alloc = trie->uniquecharcount + 1;
1656
3dab1dad 1657
a3621e74
YO
1658 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1659
c445ea15 1660 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1661 const U8 *uc = (U8*)STRING( noper );
1662 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1663
1664 U32 state = 1; /* required init */
1665
1666 U16 charid = 0; /* sanity init */
1667 U32 accept_state = 0; /* sanity init */
1668 U8 *scan = (U8*)NULL; /* sanity init */
1669
1670 STRLEN foldlen = 0; /* required init */
07be1b83 1671 U32 wordlen = 0; /* required init */
a3621e74
YO
1672 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1673
3dab1dad 1674 if ( OP(noper) != NOTHING ) {
786e8c11 1675 for ( ; uc < e ; uc += len ) {
a3621e74 1676
786e8c11 1677 TRIE_READ_CHAR;
a3621e74 1678
786e8c11
YO
1679 if ( uvc < 256 ) {
1680 charid = trie->charmap[ uvc ];
1681 } else {
55eed653 1682 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1683 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1684 }
786e8c11
YO
1685 if ( charid ) {
1686 charid--;
1687 if ( !trie->trans[ state + charid ].next ) {
1688 trie->trans[ state + charid ].next = next_alloc;
1689 trie->trans[ state ].check++;
1690 next_alloc += trie->uniquecharcount;
1691 }
1692 state = trie->trans[ state + charid ].next;
1693 } else {
1694 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1695 }
1696 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1697 }
a3621e74 1698 }
3dab1dad
YO
1699 accept_state = TRIE_NODENUM( state );
1700 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1701
1702 } /* end second pass */
1703
3dab1dad 1704 /* and now dump it out before we compress it */
2b8b4781
NC
1705 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1706 revcharmap,
1707 next_alloc, depth+1));
a3621e74 1708
a3621e74
YO
1709 {
1710 /*
1711 * Inplace compress the table.*
1712
1713 For sparse data sets the table constructed by the trie algorithm will
1714 be mostly 0/FAIL transitions or to put it another way mostly empty.
1715 (Note that leaf nodes will not contain any transitions.)
1716
1717 This algorithm compresses the tables by eliminating most such
1718 transitions, at the cost of a modest bit of extra work during lookup:
1719
1720 - Each states[] entry contains a .base field which indicates the
1721 index in the state[] array wheres its transition data is stored.
1722
1723 - If .base is 0 there are no valid transitions from that node.
1724
1725 - If .base is nonzero then charid is added to it to find an entry in
1726 the trans array.
1727
1728 -If trans[states[state].base+charid].check!=state then the
1729 transition is taken to be a 0/Fail transition. Thus if there are fail
1730 transitions at the front of the node then the .base offset will point
1731 somewhere inside the previous nodes data (or maybe even into a node
1732 even earlier), but the .check field determines if the transition is
1733 valid.
1734
786e8c11 1735 XXX - wrong maybe?
a3621e74
YO
1736 The following process inplace converts the table to the compressed
1737 table: We first do not compress the root node 1,and mark its all its
1738 .check pointers as 1 and set its .base pointer as 1 as well. This
1739 allows to do a DFA construction from the compressed table later, and
1740 ensures that any .base pointers we calculate later are greater than
1741 0.
1742
1743 - We set 'pos' to indicate the first entry of the second node.
1744
1745 - We then iterate over the columns of the node, finding the first and
1746 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1747 and set the .check pointers accordingly, and advance pos
1748 appropriately and repreat for the next node. Note that when we copy
1749 the next pointers we have to convert them from the original
1750 NODEIDX form to NODENUM form as the former is not valid post
1751 compression.
1752
1753 - If a node has no transitions used we mark its base as 0 and do not
1754 advance the pos pointer.
1755
1756 - If a node only has one transition we use a second pointer into the
1757 structure to fill in allocated fail transitions from other states.
1758 This pointer is independent of the main pointer and scans forward
1759 looking for null transitions that are allocated to a state. When it
1760 finds one it writes the single transition into the "hole". If the
786e8c11 1761 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1762
1763 - Once compressed we can Renew/realloc the structures to release the
1764 excess space.
1765
1766 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1767 specifically Fig 3.47 and the associated pseudocode.
1768
1769 demq
1770 */
a3b680e6 1771 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1772 U32 state, charid;
a3621e74 1773 U32 pos = 0, zp=0;
1e2e3d02 1774 trie->statecount = laststate;
a3621e74
YO
1775
1776 for ( state = 1 ; state < laststate ; state++ ) {
1777 U8 flag = 0;
a28509cc
AL
1778 const U32 stateidx = TRIE_NODEIDX( state );
1779 const U32 o_used = trie->trans[ stateidx ].check;
1780 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1781 trie->trans[ stateidx ].check = 0;
1782
1783 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1784 if ( flag || trie->trans[ stateidx + charid ].next ) {
1785 if ( trie->trans[ stateidx + charid ].next ) {
1786 if (o_used == 1) {
1787 for ( ; zp < pos ; zp++ ) {
1788 if ( ! trie->trans[ zp ].next ) {
1789 break;
1790 }
1791 }
1792 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1793 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1794 trie->trans[ zp ].check = state;
1795 if ( ++zp > pos ) pos = zp;
1796 break;
1797 }
1798 used--;
1799 }
1800 if ( !flag ) {
1801 flag = 1;
1802 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1803 }
1804 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1805 trie->trans[ pos ].check = state;
1806 pos++;
1807 }
1808 }
1809 }
cc601c31 1810 trie->lasttrans = pos + 1;
c944940b
JH
1811 trie->states = (reg_trie_state *)
1812 PerlMemShared_realloc( trie->states, laststate
1813 * sizeof(reg_trie_state) );
a3621e74 1814 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1815 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1816 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1817 (int)depth * 2 + 2,"",
1818 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1819 (IV)next_alloc,
1820 (IV)pos,
a3621e74
YO
1821 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1822 );
1823
1824 } /* end table compress */
1825 }
1e2e3d02
YO
1826 DEBUG_TRIE_COMPILE_MORE_r(
1827 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1828 (int)depth * 2 + 2, "",
1829 (UV)trie->statecount,
1830 (UV)trie->lasttrans)
1831 );
cc601c31 1832 /* resize the trans array to remove unused space */
c944940b
JH
1833 trie->trans = (reg_trie_trans *)
1834 PerlMemShared_realloc( trie->trans, trie->lasttrans
1835 * sizeof(reg_trie_trans) );
a3621e74 1836
3dab1dad 1837 /* and now dump out the compressed format */
2b8b4781 1838 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1839
3dab1dad 1840 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1841 U8 nodetype =(U8)(flags & 0xFF);
1842 char *str=NULL;
786e8c11 1843
07be1b83 1844#ifdef DEBUGGING
e62cc96a 1845 regnode *optimize = NULL;
b57a0404
JH
1846 U32 mjd_offset = 0;
1847 U32 mjd_nodelen = 0;
07be1b83 1848#endif
a3621e74 1849 /*
3dab1dad
YO
1850 This means we convert either the first branch or the first Exact,
1851 depending on whether the thing following (in 'last') is a branch
1852 or not and whther first is the startbranch (ie is it a sub part of
1853 the alternation or is it the whole thing.)
1854 Assuming its a sub part we conver the EXACT otherwise we convert
1855 the whole branch sequence, including the first.
a3621e74 1856 */
3dab1dad 1857 /* Find the node we are going to overwrite */
7f69552c 1858 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1859 /* branch sub-chain */
3dab1dad 1860 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1861 DEBUG_r({
1862 mjd_offset= Node_Offset((convert));
1863 mjd_nodelen= Node_Length((convert));
1864 });
7f69552c
YO
1865 /* whole branch chain */
1866 } else {
1867 DEBUG_r({
1868 const regnode *nop = NEXTOPER( convert );
1869 mjd_offset= Node_Offset((nop));
1870 mjd_nodelen= Node_Length((nop));
1871 });
07be1b83 1872 }
7f69552c 1873
07be1b83
YO
1874 DEBUG_OPTIMISE_r(
1875 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1876 (int)depth * 2 + 2, "",
786e8c11 1877 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1878 );
a3621e74 1879
3dab1dad
YO
1880 /* But first we check to see if there is a common prefix we can
1881 split out as an EXACT and put in front of the TRIE node. */
1882 trie->startstate= 1;
55eed653 1883 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1884 U32 state;
1e2e3d02 1885 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1886 U32 ofs = 0;
8e11feef
RGS
1887 I32 idx = -1;
1888 U32 count = 0;
1889 const U32 base = trie->states[ state ].trans.base;
a3621e74 1890
3dab1dad 1891 if ( trie->states[state].wordnum )
8e11feef 1892 count = 1;
a3621e74 1893
8e11feef 1894 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1895 if ( ( base + ofs >= trie->uniquecharcount ) &&
1896 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1897 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1898 {
3dab1dad 1899 if ( ++count > 1 ) {
2b8b4781 1900 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1901 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1902 if ( state == 1 ) break;
3dab1dad
YO
1903 if ( count == 2 ) {
1904 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1905 DEBUG_OPTIMISE_r(
8e11feef
RGS
1906 PerlIO_printf(Perl_debug_log,
1907 "%*sNew Start State=%"UVuf" Class: [",
1908 (int)depth * 2 + 2, "",
786e8c11 1909 (UV)state));
be8e71aa 1910 if (idx >= 0) {
2b8b4781 1911 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1912 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1913
3dab1dad 1914 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1915 if ( folder )
1916 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1917 DEBUG_OPTIMISE_r(
07be1b83 1918 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1919 );
8e11feef
RGS
1920 }
1921 }
1922 TRIE_BITMAP_SET(trie,*ch);
1923 if ( folder )
1924 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1925 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1926 }
1927 idx = ofs;
1928 }
3dab1dad
YO
1929 }
1930 if ( count == 1 ) {
2b8b4781 1931 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1932 char *ch = SvPV_nolen( *tmp );
1933 DEBUG_OPTIMISE_r({
1934 SV *sv=sv_newmortal();
8e11feef
RGS
1935 PerlIO_printf( Perl_debug_log,
1936 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1937 (int)depth * 2 + 2, "",
de734bd5
A
1938 (UV)state, (UV)idx,
1939 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1940 PL_colors[0], PL_colors[1],
1941 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1942 PERL_PV_ESCAPE_FIRSTCHAR
1943 )
1944 );
1945 });
3dab1dad
YO
1946 if ( state==1 ) {
1947 OP( convert ) = nodetype;
1948 str=STRING(convert);
1949 STR_LEN(convert)=0;
1950 }
de734bd5
A
1951 while (*ch) {
1952 *str++ = *ch++;
1953 STR_LEN(convert)++;
1954 }
1955
8e11feef 1956 } else {
f9049ba1 1957#ifdef DEBUGGING
8e11feef
RGS
1958 if (state>1)
1959 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1960#endif
8e11feef
RGS
1961 break;
1962 }
1963 }
3dab1dad 1964 if (str) {
8e11feef 1965 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1966 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1967 trie->startstate = state;
07be1b83
YO
1968 trie->minlen -= (state - 1);
1969 trie->maxlen -= (state - 1);
1970 DEBUG_r({
1971 regnode *fix = convert;
de734bd5 1972 U32 word = trie->wordcount;
07be1b83
YO
1973 mjd_nodelen++;
1974 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1975 while( ++fix < n ) {
1976 Set_Node_Offset_Length(fix, 0, 0);
1977 }
de734bd5 1978 while (word--) {
2b8b4781 1979 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
1980 if (tmp) {
1981 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1982 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1983 else
1984 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1985 }
1986 }
07be1b83 1987 });
8e11feef
RGS
1988 if (trie->maxlen) {
1989 convert = n;
1990 } else {
3dab1dad 1991 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1992 DEBUG_r(optimize= n);
3dab1dad
YO
1993 }
1994 }
1995 }
a5ca303d
YO
1996 if (!jumper)
1997 jumper = last;
3dab1dad 1998 if ( trie->maxlen ) {
8e11feef
RGS
1999 NEXT_OFF( convert ) = (U16)(tail - convert);
2000 ARG_SET( convert, data_slot );
786e8c11
YO
2001 /* Store the offset to the first unabsorbed branch in
2002 jump[0], which is otherwise unused by the jump logic.
2003 We use this when dumping a trie and during optimisation. */
2004 if (trie->jump)
7f69552c 2005 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2006
786e8c11
YO
2007 /* XXXX */
2008 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2009 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2010 {
2011 OP( convert ) = TRIEC;
2012 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2013 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2014 trie->bitmap= NULL;
2015 } else
2016 OP( convert ) = TRIE;
a3621e74 2017
3dab1dad
YO
2018 /* store the type in the flags */
2019 convert->flags = nodetype;
a5ca303d
YO
2020 DEBUG_r({
2021 optimize = convert
2022 + NODE_STEP_REGNODE
2023 + regarglen[ OP( convert ) ];
2024 });
2025 /* XXX We really should free up the resource in trie now,
2026 as we won't use them - (which resources?) dmq */
3dab1dad 2027 }
a3621e74 2028 /* needed for dumping*/
e62cc96a 2029 DEBUG_r(if (optimize) {
07be1b83 2030 regnode *opt = convert;
e62cc96a 2031 while ( ++opt < optimize) {
07be1b83
YO
2032 Set_Node_Offset_Length(opt,0,0);
2033 }
786e8c11
YO
2034 /*
2035 Try to clean up some of the debris left after the
2036 optimisation.
a3621e74 2037 */
786e8c11 2038 while( optimize < jumper ) {
07be1b83 2039 mjd_nodelen += Node_Length((optimize));
a3621e74 2040 OP( optimize ) = OPTIMIZED;
07be1b83 2041 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2042 optimize++;
2043 }
07be1b83 2044 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2045 });
2046 } /* end node insert */
55eed653 2047 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2048#ifdef DEBUGGING
2049 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2050 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2051#else
2052 SvREFCNT_dec(revcharmap);
07be1b83 2053#endif
786e8c11
YO
2054 return trie->jump
2055 ? MADE_JUMP_TRIE
2056 : trie->startstate>1
2057 ? MADE_EXACT_TRIE
2058 : MADE_TRIE;
2059}
2060
2061STATIC void
2062S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2063{
2064/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2065
2066 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2067 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2068 ISBN 0-201-10088-6
2069
2070 We find the fail state for each state in the trie, this state is the longest proper
2071 suffix of the current states 'word' that is also a proper prefix of another word in our
2072 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2073 the DFA not to have to restart after its tried and failed a word at a given point, it
2074 simply continues as though it had been matching the other word in the first place.
2075 Consider
2076 'abcdgu'=~/abcdefg|cdgu/
2077 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2078 fail, which would bring use to the state representing 'd' in the second word where we would
2079 try 'g' and succeed, prodceding to match 'cdgu'.
2080 */
2081 /* add a fail transition */
3251b653
NC
2082 const U32 trie_offset = ARG(source);
2083 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2084 U32 *q;
2085 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2086 const U32 numstates = trie->statecount;
786e8c11
YO
2087 const U32 ubound = trie->lasttrans + ucharcount;
2088 U32 q_read = 0;
2089 U32 q_write = 0;
2090 U32 charid;
2091 U32 base = trie->states[ 1 ].trans.base;
2092 U32 *fail;
2093 reg_ac_data *aho;
2094 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2095 GET_RE_DEBUG_FLAGS_DECL;
2096#ifndef DEBUGGING
2097 PERL_UNUSED_ARG(depth);
2098#endif
2099
2100
2101 ARG_SET( stclass, data_slot );
c944940b 2102 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2103 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2104 aho->trie=trie_offset;
446bd890
NC
2105 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2106 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2107 Newxz( q, numstates, U32);
c944940b 2108 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2109 aho->refcount = 1;
2110 fail = aho->fail;
2111 /* initialize fail[0..1] to be 1 so that we always have
2112 a valid final fail state */
2113 fail[ 0 ] = fail[ 1 ] = 1;
2114
2115 for ( charid = 0; charid < ucharcount ; charid++ ) {
2116 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2117 if ( newstate ) {
2118 q[ q_write ] = newstate;
2119 /* set to point at the root */
2120 fail[ q[ q_write++ ] ]=1;
2121 }
2122 }
2123 while ( q_read < q_write) {
2124 const U32 cur = q[ q_read++ % numstates ];
2125 base = trie->states[ cur ].trans.base;
2126
2127 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2128 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2129 if (ch_state) {
2130 U32 fail_state = cur;
2131 U32 fail_base;
2132 do {
2133 fail_state = fail[ fail_state ];
2134 fail_base = aho->states[ fail_state ].trans.base;
2135 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2136
2137 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2138 fail[ ch_state ] = fail_state;
2139 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2140 {
2141 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2142 }
2143 q[ q_write++ % numstates] = ch_state;
2144 }
2145 }
2146 }
2147 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2148 when we fail in state 1, this allows us to use the
2149 charclass scan to find a valid start char. This is based on the principle
2150 that theres a good chance the string being searched contains lots of stuff
2151 that cant be a start char.
2152 */
2153 fail[ 0 ] = fail[ 1 ] = 0;
2154 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2155 PerlIO_printf(Perl_debug_log,
2156 "%*sStclass Failtable (%"UVuf" states): 0",
2157 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2158 );
786e8c11
YO
2159 for( q_read=1; q_read<numstates; q_read++ ) {
2160 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2161 }
2162 PerlIO_printf(Perl_debug_log, "\n");
2163 });
2164 Safefree(q);
2165 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2166}
2167
786e8c11 2168
a3621e74 2169/*
5d1c421c
JH
2170 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2171 * These need to be revisited when a newer toolchain becomes available.
2172 */
2173#if defined(__sparc64__) && defined(__GNUC__)
2174# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2175# undef SPARC64_GCC_WORKAROUND
2176# define SPARC64_GCC_WORKAROUND 1
2177# endif
2178#endif
2179
07be1b83 2180#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2181 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2182 SV * const mysv=sv_newmortal(); \
2183 regnode *Next = regnext(scan); \
2184 regprop(RExC_rx, mysv, scan); \
7f69552c 2185 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2186 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2187 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2188 }});
07be1b83 2189
1de06328
YO
2190
2191
2192
2193
07be1b83
YO
2194#define JOIN_EXACT(scan,min,flags) \
2195 if (PL_regkind[OP(scan)] == EXACT) \
2196 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2197
be8e71aa 2198STATIC U32
07be1b83
YO
2199S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2200 /* Merge several consecutive EXACTish nodes into one. */
2201 regnode *n = regnext(scan);
2202 U32 stringok = 1;
2203 regnode *next = scan + NODE_SZ_STR(scan);
2204 U32 merged = 0;
2205 U32 stopnow = 0;
2206#ifdef DEBUGGING
2207 regnode *stop = scan;
72f13be8 2208 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2209#else
d47053eb
RGS
2210 PERL_UNUSED_ARG(depth);
2211#endif
2212#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2213 PERL_UNUSED_ARG(flags);
2214 PERL_UNUSED_ARG(val);
07be1b83 2215#endif
07be1b83
YO
2216 DEBUG_PEEP("join",scan,depth);
2217
2218 /* Skip NOTHING, merge EXACT*. */
2219 while (n &&
2220 ( PL_regkind[OP(n)] == NOTHING ||
2221 (stringok && (OP(n) == OP(scan))))
2222 && NEXT_OFF(n)
2223 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2224
2225 if (OP(n) == TAIL || n > next)
2226 stringok = 0;
2227 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2228 DEBUG_PEEP("skip:",n,depth);
2229 NEXT_OFF(scan) += NEXT_OFF(n);
2230 next = n + NODE_STEP_REGNODE;
2231#ifdef DEBUGGING
2232 if (stringok)
2233 stop = n;
2234#endif
2235 n = regnext(n);
2236 }
2237 else if (stringok) {
786e8c11 2238 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2239 regnode * const nnext = regnext(n);
2240
2241 DEBUG_PEEP("merg",n,depth);
2242
2243 merged++;
2244 if (oldl + STR_LEN(n) > U8_MAX)
2245 break;
2246 NEXT_OFF(scan) += NEXT_OFF(n);
2247 STR_LEN(scan) += STR_LEN(n);
2248 next = n + NODE_SZ_STR(n);
2249 /* Now we can overwrite *n : */
2250 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2251#ifdef DEBUGGING
2252 stop = next - 1;
2253#endif
2254 n = nnext;
2255 if (stopnow) break;
2256 }
2257
d47053eb
RGS
2258#ifdef EXPERIMENTAL_INPLACESCAN
2259 if (flags && !NEXT_OFF(n)) {
2260 DEBUG_PEEP("atch", val, depth);
2261 if (reg_off_by_arg[OP(n)]) {
2262 ARG_SET(n, val - n);
2263 }
2264 else {
2265 NEXT_OFF(n) = val - n;
2266 }
2267 stopnow = 1;
2268 }
07be1b83
YO
2269#endif
2270 }
2271
2272 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2273 /*
2274 Two problematic code points in Unicode casefolding of EXACT nodes:
2275
2276 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2277 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2278
2279 which casefold to
2280
2281 Unicode UTF-8
2282
2283 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2284 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2285
2286 This means that in case-insensitive matching (or "loose matching",
2287 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2288 length of the above casefolded versions) can match a target string
2289 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2290 This would rather mess up the minimum length computation.
2291
2292 What we'll do is to look for the tail four bytes, and then peek
2293 at the preceding two bytes to see whether we need to decrease
2294 the minimum length by four (six minus two).
2295
2296 Thanks to the design of UTF-8, there cannot be false matches:
2297 A sequence of valid UTF-8 bytes cannot be a subsequence of
2298 another valid sequence of UTF-8 bytes.
2299
2300 */
2301 char * const s0 = STRING(scan), *s, *t;
2302 char * const s1 = s0 + STR_LEN(scan) - 1;
2303 char * const s2 = s1 - 4;
e294cc5d
JH
2304#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2305 const char t0[] = "\xaf\x49\xaf\x42";
2306#else
07be1b83 2307 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2308#endif
07be1b83
YO
2309 const char * const t1 = t0 + 3;
2310
2311 for (s = s0 + 2;
2312 s < s2 && (t = ninstr(s, s1, t0, t1));
2313 s = t + 4) {
e294cc5d
JH
2314#ifdef EBCDIC
2315 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2316 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2317#else
07be1b83
YO
2318 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2319 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2320#endif
07be1b83
YO
2321 *min -= 4;
2322 }
2323 }
2324
2325#ifdef DEBUGGING
2326 /* Allow dumping */
2327 n = scan + NODE_SZ_STR(scan);
2328 while (n <= stop) {
2329 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2330 OP(n) = OPTIMIZED;
2331 NEXT_OFF(n) = 0;
2332 }
2333 n++;
2334 }
2335#endif
2336 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2337 return stopnow;
2338}
2339
653099ff
GS
2340/* REx optimizer. Converts nodes into quickier variants "in place".
2341 Finds fixed substrings. */
2342
a0288114 2343/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2344 to the position after last scanned or to NULL. */
2345
40d049e4
YO
2346#define INIT_AND_WITHP \
2347 assert(!and_withp); \
2348 Newx(and_withp,1,struct regnode_charclass_class); \
2349 SAVEFREEPV(and_withp)
07be1b83 2350
b515a41d
YO
2351/* this is a chain of data about sub patterns we are processing that
2352 need to be handled seperately/specially in study_chunk. Its so
2353 we can simulate recursion without losing state. */
2354struct scan_frame;
2355typedef struct scan_frame {
2356 regnode *last; /* last node to process in this frame */
2357 regnode *next; /* next node to process when last is reached */
2358 struct scan_frame *prev; /*previous frame*/
2359 I32 stop; /* what stopparen do we use */
2360} scan_frame;
2361
304ee84b
YO
2362
2363#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2364
76e3520e 2365STATIC I32
40d049e4 2366S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2367 I32 *minlenp, I32 *deltap,
40d049e4
YO
2368 regnode *last,
2369 scan_data_t *data,
2370 I32 stopparen,
2371 U8* recursed,
2372 struct regnode_charclass_class *and_withp,
2373 U32 flags, U32 depth)
c277df42
IZ
2374 /* scanp: Start here (read-write). */
2375 /* deltap: Write maxlen-minlen here. */
2376 /* last: Stop before this one. */
40d049e4
YO
2377 /* data: string data about the pattern */
2378 /* stopparen: treat close N as END */
2379 /* recursed: which subroutines have we recursed into */
2380 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2381{
97aff369 2382 dVAR;
c277df42
IZ
2383 I32 min = 0, pars = 0, code;
2384 regnode *scan = *scanp, *next;
2385 I32 delta = 0;
2386 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2387 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2388 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2389 scan_data_t data_fake;
a3621e74 2390 SV *re_trie_maxbuff = NULL;
786e8c11 2391 regnode *first_non_open = scan;
e2e6a0f1 2392 I32 stopmin = I32_MAX;
8aa23a47
YO
2393 scan_frame *frame = NULL;
2394
a3621e74 2395 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2396
13a24bad 2397#ifdef DEBUGGING
40d049e4 2398 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2399#endif
40d049e4 2400
786e8c11 2401 if ( depth == 0 ) {
40d049e4 2402 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2403 first_non_open=regnext(first_non_open);
2404 }
2405
b81d288d 2406
8aa23a47
YO
2407 fake_study_recurse:
2408 while ( scan && OP(scan) != END && scan < last ){
2409 /* Peephole optimizer: */
304ee84b 2410 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2411 DEBUG_PEEP("Peep",scan,depth);
2412 JOIN_EXACT(scan,&min,0);
2413
2414 /* Follow the next-chain of the current node and optimize
2415 away all the NOTHINGs from it. */
2416 if (OP(scan) != CURLYX) {
2417 const int max = (reg_off_by_arg[OP(scan)]
2418 ? I32_MAX
2419 /* I32 may be smaller than U16 on CRAYs! */
2420 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2421 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2422 int noff;
2423 regnode *n = scan;
2424
2425 /* Skip NOTHING and LONGJMP. */
2426 while ((n = regnext(n))
2427 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2428 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2429 && off + noff < max)
2430 off += noff;
2431 if (reg_off_by_arg[OP(scan)])
2432 ARG(scan) = off;
2433 else
2434 NEXT_OFF(scan) = off;
2435 }
a3621e74 2436
c277df42 2437
8aa23a47
YO
2438
2439 /* The principal pseudo-switch. Cannot be a switch, since we
2440 look into several different things. */
2441 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2442 || OP(scan) == IFTHEN) {
2443 next = regnext(scan);
2444 code = OP(scan);
2445 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2446
2447 if (OP(next) == code || code == IFTHEN) {
2448 /* NOTE - There is similar code to this block below for handling
2449 TRIE nodes on a re-study. If you change stuff here check there
2450 too. */
2451 I32 max1 = 0, min1 = I32_MAX, num = 0;
2452 struct regnode_charclass_class accum;
2453 regnode * const startbranch=scan;
2454
2455 if (flags & SCF_DO_SUBSTR)
304ee84b 2456 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2457 if (flags & SCF_DO_STCLASS)
2458 cl_init_zero(pRExC_state, &accum);
2459
2460 while (OP(scan) == code) {
2461 I32 deltanext, minnext, f = 0, fake;
2462 struct regnode_charclass_class this_class;
2463
2464 num++;
2465 data_fake.flags = 0;
2466 if (data) {
2467 data_fake.whilem_c = data->whilem_c;
2468 data_fake.last_closep = data->last_closep;
2469 }
2470 else
2471 data_fake.last_closep = &fake;
58e23c8d
YO
2472
2473 data_fake.pos_delta = delta;
8aa23a47
YO
2474 next = regnext(scan);
2475 scan = NEXTOPER(scan);
2476 if (code != BRANCH)
c277df42 2477 scan = NEXTOPER(scan);
8aa23a47
YO
2478 if (flags & SCF_DO_STCLASS) {
2479 cl_init(pRExC_state, &this_class);
2480 data_fake.start_class = &this_class;
2481 f = SCF_DO_STCLASS_AND;
58e23c8d 2482 }
8aa23a47
YO
2483 if (flags & SCF_WHILEM_VISITED_POS)
2484 f |= SCF_WHILEM_VISITED_POS;
2485
2486 /* we suppose the run is continuous, last=next...*/
2487 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2488 next, &data_fake,
2489 stopparen, recursed, NULL, f,depth+1);
2490 if (min1 > minnext)
2491 min1 = minnext;
2492 if (max1 < minnext + deltanext)
2493 max1 = minnext + deltanext;
2494 if (deltanext == I32_MAX)
2495 is_inf = is_inf_internal = 1;
2496 scan = next;
2497 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2498 pars++;
2499 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2500 if ( stopmin > minnext)
2501 stopmin = min + min1;
2502 flags &= ~SCF_DO_SUBSTR;
2503 if (data)
2504 data->flags |= SCF_SEEN_ACCEPT;
2505 }
2506 if (data) {
2507 if (data_fake.flags & SF_HAS_EVAL)
2508 data->flags |= SF_HAS_EVAL;
2509 data->whilem_c = data_fake.whilem_c;
3dab1dad 2510 }
8aa23a47
YO
2511 if (flags & SCF_DO_STCLASS)
2512 cl_or(pRExC_state, &accum, &this_class);
2513 }
2514 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2515 min1 = 0;
2516 if (flags & SCF_DO_SUBSTR) {
2517 data->pos_min += min1;
2518 data->pos_delta += max1 - min1;
2519 if (max1 != min1 || is_inf)
2520 data->longest = &(data->longest_float);
2521 }
2522 min += min1;
2523 delta += max1 - min1;
2524 if (flags & SCF_DO_STCLASS_OR) {
2525 cl_or(pRExC_state, data->start_class, &accum);
2526 if (min1) {
2527 cl_and(data->start_class, and_withp);
2528 flags &= ~SCF_DO_STCLASS;
653099ff 2529 }
8aa23a47
YO
2530 }
2531 else if (flags & SCF_DO_STCLASS_AND) {
2532 if (min1) {
2533 cl_and(data->start_class, &accum);
2534 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2535 }
8aa23a47
YO
2536 else {
2537 /* Switch to OR mode: cache the old value of
2538 * data->start_class */
2539 INIT_AND_WITHP;
2540 StructCopy(data->start_class, and_withp,
2541 struct regnode_charclass_class);
2542 flags &= ~SCF_DO_STCLASS_AND;
2543 StructCopy(&accum, data->start_class,
2544 struct regnode_charclass_class);
2545 flags |= SCF_DO_STCLASS_OR;
2546 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2547 }
8aa23a47 2548 }
a3621e74 2549
8aa23a47
YO
2550 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2551 /* demq.
a3621e74 2552
8aa23a47
YO
2553 Assuming this was/is a branch we are dealing with: 'scan' now
2554 points at the item that follows the branch sequence, whatever
2555 it is. We now start at the beginning of the sequence and look
2556 for subsequences of
a3621e74 2557
8aa23a47
YO
2558 BRANCH->EXACT=>x1
2559 BRANCH->EXACT=>x2
2560 tail
a3621e74 2561
8aa23a47 2562 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2563
8aa23a47
YO
2564 If we can find such a subseqence we need to turn the first
2565 element into a trie and then add the subsequent branch exact
2566 strings to the trie.
a3621e74 2567
8aa23a47 2568 We have two cases
a3621e74 2569
8aa23a47 2570 1. patterns where the whole set of branch can be converted.
a3621e74 2571
8aa23a47 2572 2. patterns where only a subset can be converted.
a3621e74 2573
8aa23a47
YO
2574 In case 1 we can replace the whole set with a single regop
2575 for the trie. In case 2 we need to keep the start and end
2576 branchs so
a3621e74 2577
8aa23a47
YO
2578 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2579 becomes BRANCH TRIE; BRANCH X;
786e8c11 2580
8aa23a47
YO
2581 There is an additional case, that being where there is a
2582 common prefix, which gets split out into an EXACT like node
2583 preceding the TRIE node.
a3621e74 2584
8aa23a47
YO
2585 If x(1..n)==tail then we can do a simple trie, if not we make
2586 a "jump" trie, such that when we match the appropriate word
2587 we "jump" to the appopriate tail node. Essentailly we turn
2588 a nested if into a case structure of sorts.
b515a41d 2589
8aa23a47
YO
2590 */
2591
2592 int made=0;
2593 if (!re_trie_maxbuff) {
2594 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2595 if (!SvIOK(re_trie_maxbuff))
2596 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2597 }
2598 if ( SvIV(re_trie_maxbuff)>=0 ) {
2599 regnode *cur;
2600 regnode *first = (regnode *)NULL;
2601 regnode *last = (regnode *)NULL;
2602 regnode *tail = scan;
2603 U8 optype = 0;
2604 U32 count=0;
a3621e74
YO
2605
2606#ifdef DEBUGGING
8aa23a47 2607 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2608#endif
8aa23a47
YO
2609 /* var tail is used because there may be a TAIL
2610 regop in the way. Ie, the exacts will point to the
2611 thing following the TAIL, but the last branch will
2612 point at the TAIL. So we advance tail. If we
2613 have nested (?:) we may have to move through several
2614 tails.
2615 */
2616
2617 while ( OP( tail ) == TAIL ) {
2618 /* this is the TAIL generated by (?:) */
2619 tail = regnext( tail );
2620 }
a3621e74 2621
8aa23a47
YO
2622
2623 DEBUG_OPTIMISE_r({
2624 regprop(RExC_rx, mysv, tail );
2625 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2626 (int)depth * 2 + 2, "",
2627 "Looking for TRIE'able sequences. Tail node is: ",
2628 SvPV_nolen_const( mysv )
2629 );
2630 });
2631
2632 /*
2633
2634 step through the branches, cur represents each
2635 branch, noper is the first thing to be matched
2636 as part of that branch and noper_next is the
2637 regnext() of that node. if noper is an EXACT
2638 and noper_next is the same as scan (our current
2639 position in the regex) then the EXACT branch is
2640 a possible optimization target. Once we have
2641 two or more consequetive such branches we can
2642 create a trie of the EXACT's contents and stich
2643 it in place. If the sequence represents all of
2644 the branches we eliminate the whole thing and
2645 replace it with a single TRIE. If it is a
2646 subsequence then we need to stitch it in. This
2647 means the first branch has to remain, and needs
2648 to be repointed at the item on the branch chain
2649 following the last branch optimized. This could
2650 be either a BRANCH, in which case the
2651 subsequence is internal, or it could be the
2652 item following the branch sequence in which
2653 case the subsequence is at the end.
2654
2655 */
2656
2657 /* dont use tail as the end marker for this traverse */
2658 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2659 regnode * const noper = NEXTOPER( cur );
b515a41d 2660#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2661 regnode * const noper_next = regnext( noper );
b515a41d
YO
2662#endif
2663
8aa23a47
YO
2664 DEBUG_OPTIMISE_r({
2665 regprop(RExC_rx, mysv, cur);
2666 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2667 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2668
2669 regprop(RExC_rx, mysv, noper);
2670 PerlIO_printf( Perl_debug_log, " -> %s",
2671 SvPV_nolen_const(mysv));
2672
2673 if ( noper_next ) {
2674 regprop(RExC_rx, mysv, noper_next );
2675 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2676 SvPV_nolen_const(mysv));
2677 }
2678 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2679 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2680 });
2681 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2682 : PL_regkind[ OP( noper ) ] == EXACT )
2683 || OP(noper) == NOTHING )
786e8c11 2684#ifdef NOJUMPTRIE
8aa23a47 2685 && noper_next == tail
786e8c11 2686#endif
8aa23a47
YO
2687 && count < U16_MAX)
2688 {
2689 count++;
2690 if ( !first || optype == NOTHING ) {
2691 if (!first) first = cur;
2692 optype = OP( noper );
2693 } else {
2694 last = cur;
2695 }
2696 } else {
2697 if ( last ) {
2698 make_trie( pRExC_state,
2699 startbranch, first, cur, tail, count,
2700 optype, depth+1 );
2701 }
2702 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2703#ifdef NOJUMPTRIE
8aa23a47 2704 && noper_next == tail
786e8c11 2705#endif
8aa23a47
YO
2706 ){
2707 count = 1;
2708 first = cur;
2709 optype = OP( noper );
2710 } else {
2711 count = 0;
2712 first = NULL;
2713 optype = 0;
2714 }
2715 last = NULL;
2716 }
2717 }
2718 DEBUG_OPTIMISE_r({
2719 regprop(RExC_rx, mysv, cur);
2720 PerlIO_printf( Perl_debug_log,
2721 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2722 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2723
2724 });
2725 if ( last ) {
2726 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2727#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2728 if ( ((made == MADE_EXACT_TRIE &&
2729 startbranch == first)
2730 || ( first_non_open == first )) &&
2731 depth==0 ) {
2732 flags |= SCF_TRIE_RESTUDY;
2733 if ( startbranch == first
2734 && scan == tail )
2735 {
2736 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2737 }
2738 }
3dab1dad 2739#endif
8aa23a47
YO
2740 }
2741 }
2742
2743 } /* do trie */
2744
653099ff 2745 }
8aa23a47
YO
2746 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2747 scan = NEXTOPER(NEXTOPER(scan));
2748 } else /* single branch is optimized. */
2749 scan = NEXTOPER(scan);
2750 continue;
2751 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2752 scan_frame *newframe = NULL;
2753 I32 paren;
2754 regnode *start;
2755 regnode *end;
2756
2757 if (OP(scan) != SUSPEND) {
2758 /* set the pointer */
2759 if (OP(scan) == GOSUB) {
2760 paren = ARG(scan);
2761 RExC_recurse[ARG2L(scan)] = scan;
2762 start = RExC_open_parens[paren-1];
2763 end = RExC_close_parens[paren-1];
2764 } else {
2765 paren = 0;
f8fc2ecf 2766 start = RExC_rxi->program + 1;
8aa23a47
YO
2767 end = RExC_opend;
2768 }
2769 if (!recursed) {
2770 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2771 SAVEFREEPV(recursed);
2772 }
2773 if (!PAREN_TEST(recursed,paren+1)) {
2774 PAREN_SET(recursed,paren+1);
2775 Newx(newframe,1,scan_frame);
2776 } else {
2777 if (flags & SCF_DO_SUBSTR) {
304ee84b 2778 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2779 data->longest = &(data->longest_float);
2780 }
2781 is_inf = is_inf_internal = 1;
2782 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2783 cl_anything(pRExC_state, data->start_class);
2784 flags &= ~SCF_DO_STCLASS;
2785 }
2786 } else {
2787 Newx(newframe,1,scan_frame);
2788 paren = stopparen;
2789 start = scan+2;
2790 end = regnext(scan);
2791 }
2792 if (newframe) {
2793 assert(start);
2794 assert(end);
2795 SAVEFREEPV(newframe);
2796 newframe->next = regnext(scan);
2797 newframe->last = last;
2798 newframe->stop = stopparen;
2799 newframe->prev = frame;
2800
2801 frame = newframe;
2802 scan = start;
2803 stopparen = paren;
2804 last = end;
2805
2806 continue;
2807 }
2808 }
2809 else if (OP(scan) == EXACT) {
2810 I32 l = STR_LEN(scan);
2811 UV uc;
2812 if (UTF) {
2813 const U8 * const s = (U8*)STRING(scan);
2814 l = utf8_length(s, s + l);
2815 uc = utf8_to_uvchr(s, NULL);
2816 } else {
2817 uc = *((U8*)STRING(scan));
2818 }
2819 min += l;
2820 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2821 /* The code below prefers earlier match for fixed
2822 offset, later match for variable offset. */
2823 if (data->last_end == -1) { /* Update the start info. */
2824 data->last_start_min = data->pos_min;
2825 data->last_start_max = is_inf
2826 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2827 }
8aa23a47
YO
2828 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2829 if (UTF)
2830 SvUTF8_on(data->last_found);
2831 {
2832 SV * const sv = data->last_found;
2833 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2834 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2835 if (mg && mg->mg_len >= 0)
2836 mg->mg_len += utf8_length((U8*)STRING(scan),
2837 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2838 }
8aa23a47
YO
2839 data->last_end = data->pos_min + l;
2840 data->pos_min += l; /* As in the first entry. */
2841 data->flags &= ~SF_BEFORE_EOL;
2842 }
2843 if (flags & SCF_DO_STCLASS_AND) {
2844 /* Check whether it is compatible with what we know already! */
2845 int compat = 1;
2846
2847 if (uc >= 0x100 ||
2848 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2849 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2850 && (!(data->start_class->flags & ANYOF_FOLD)
2851 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2852 )
2853 compat = 0;
2854 ANYOF_CLASS_ZERO(data->start_class);
2855 ANYOF_BITMAP_ZERO(data->start_class);
2856 if (compat)
2857 ANYOF_BITMAP_SET(data->start_class, uc);
2858 data->start_class->flags &= ~ANYOF_EOS;
2859 if (uc < 0x100)
2860 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2861 }
2862 else if (flags & SCF_DO_STCLASS_OR) {
2863 /* false positive possible if the class is case-folded */
2864 if (uc < 0x100)
2865 ANYOF_BITMAP_SET(data->start_class, uc);
2866 else
2867 data->start_class->flags |= ANYOF_UNICODE_ALL;
2868 data->start_class->flags &= ~ANYOF_EOS;
2869 cl_and(data->start_class, and_withp);
2870 }
2871 flags &= ~SCF_DO_STCLASS;
2872 }
2873 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2874 I32 l = STR_LEN(scan);
2875 UV uc = *((U8*)STRING(scan));
2876
2877 /* Search for fixed substrings supports EXACT only. */
2878 if (flags & SCF_DO_SUBSTR) {
2879 assert(data);
304ee84b 2880 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2881 }
2882 if (UTF) {
2883 const U8 * const s = (U8 *)STRING(scan);
2884 l = utf8_length(s, s + l);
2885 uc = utf8_to_uvchr(s, NULL);
2886 }
2887 min += l;
2888 if (flags & SCF_DO_SUBSTR)
2889 data->pos_min += l;
2890 if (flags & SCF_DO_STCLASS_AND) {
2891 /* Check whether it is compatible with what we know already! */
2892 int compat = 1;
2893
2894 if (uc >= 0x100 ||
2895 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2896 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2897 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2898 compat = 0;
2899 ANYOF_CLASS_ZERO(data->start_class);
2900 ANYOF_BITMAP_ZERO(data->start_class);
2901 if (compat) {
2902 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2903 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2904 data->start_class->flags |= ANYOF_FOLD;
2905 if (OP(scan) == EXACTFL)
2906 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2907 }
8aa23a47
YO
2908 }
2909 else if (flags & SCF_DO_STCLASS_OR) {
2910 if (data->start_class->flags & ANYOF_FOLD) {
2911 /* false positive possible if the class is case-folded.
2912 Assume that the locale settings are the same... */
1aa99e6b
IH
2913 if (uc < 0x100)
2914 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2915 data->start_class->flags &= ~ANYOF_EOS;
2916 }
8aa23a47 2917 cl_and(data->start_class, and_withp);
653099ff 2918 }
8aa23a47
YO
2919 flags &= ~SCF_DO_STCLASS;
2920 }
2921 else if (strchr((const char*)PL_varies,OP(scan))) {
2922 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2923 I32 f = flags, pos_before = 0;
2924 regnode * const oscan = scan;
2925 struct regnode_charclass_class this_class;
2926 struct regnode_charclass_class *oclass = NULL;
2927 I32 next_is_eval = 0;
2928
2929 switch (PL_regkind[OP(scan)]) {
2930 case WHILEM: /* End of (?:...)* . */
2931 scan = NEXTOPER(scan);
2932 goto finish;
2933 case PLUS:
2934 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2935 next = NEXTOPER(scan);
2936 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2937 mincount = 1;
2938 maxcount = REG_INFTY;
2939 next = regnext(scan);
2940 scan = NEXTOPER(scan);
2941 goto do_curly;
2942 }
2943 }
2944 if (flags & SCF_DO_SUBSTR)
2945 data->pos_min++;
2946 min++;
2947 /* Fall through. */
2948 case STAR:
2949 if (flags & SCF_DO_STCLASS) {
2950 mincount = 0;
2951 maxcount = REG_INFTY;
2952 next = regnext(scan);
2953 scan = NEXTOPER(scan);
2954 goto do_curly;
2955 }
2956 is_inf = is_inf_internal = 1;
2957 scan = regnext(scan);
c277df42 2958 if (flags & SCF_DO_SUBSTR) {
304ee84b 2959 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2960 data->longest = &(data->longest_float);
c277df42 2961 }
8aa23a47
YO
2962 goto optimize_curly_tail;
2963 case CURLY:
2964 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2965 && (scan->flags == stopparen))
2966 {
2967 mincount = 1;
2968 maxcount = 1;
2969 } else {
2970 mincount = ARG1(scan);
2971 maxcount = ARG2(scan);
653099ff 2972 }
8aa23a47
YO
2973 next = regnext(scan);
2974 if (OP(scan) == CURLYX) {
2975 I32 lp = (data ? *(data->last_closep) : 0);
2976 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2977 }
8aa23a47
YO
2978 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2979 next_is_eval = (OP(scan) == EVAL);
2980 do_curly:
2981 if (flags & SCF_DO_SUBSTR) {
304ee84b 2982 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 2983 pos_before = data->pos_min;
b45f050a 2984 }
8aa23a47
YO
2985 if (data) {
2986 fl = data->flags;
2987 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2988 if (is_inf)
2989 data->flags |= SF_IS_INF;
2990 }
2991 if (flags & SCF_DO_STCLASS) {
2992 cl_init(pRExC_state, &this_class);
2993 oclass = data->start_class;
2994 data->start_class = &this_class;
2995 f |= SCF_DO_STCLASS_AND;
2996 f &= ~SCF_DO_STCLASS_OR;
2997 }
2998 /* These are the cases when once a subexpression
2999 fails at a particular position, it cannot succeed
3000 even after backtracking at the enclosing scope.
3001
3002 XXXX what if minimal match and we are at the
3003 initial run of {n,m}? */
3004 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3005 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3006
8aa23a47
YO
3007 /* This will finish on WHILEM, setting scan, or on NULL: */
3008 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3009 last, data, stopparen, recursed, NULL,
3010 (mincount == 0
3011 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3012
8aa23a47
YO
3013 if (flags & SCF_DO_STCLASS)
3014 data->start_class = oclass;
3015 if (mincount == 0 || minnext == 0) {
3016 if (flags & SCF_DO_STCLASS_OR) {
3017 cl_or(pRExC_state, data->start_class, &this_class);
3018 }
3019 else if (flags & SCF_DO_STCLASS_AND) {
3020 /* Switch to OR mode: cache the old value of
3021 * data->start_class */
3022 INIT_AND_WITHP;
3023 StructCopy(data->start_class, and_withp,
3024 struct regnode_charclass_class);
3025 flags &= ~SCF_DO_STCLASS_AND;
3026 StructCopy(&this_class, data->start_class,
3027 struct regnode_charclass_class);
3028 flags |= SCF_DO_STCLASS_OR;
3029 data->start_class->flags |= ANYOF_EOS;
3030 }
3031 } else { /* Non-zero len */
3032 if (flags & SCF_DO_STCLASS_OR) {
3033 cl_or(pRExC_state, data->start_class, &this_class);
3034 cl_and(data->start_class, and_withp);
3035 }
3036 else if (flags & SCF_DO_STCLASS_AND)
3037 cl_and(data->start_class, &this_class);
3038 flags &= ~SCF_DO_STCLASS;
3039 }
3040 if (!scan) /* It was not CURLYX, but CURLY. */
3041 scan = next;
3042 if ( /* ? quantifier ok, except for (?{ ... }) */
3043 (next_is_eval || !(mincount == 0 && maxcount == 1))
3044 && (minnext == 0) && (deltanext == 0)
3045 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3046 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3047 && ckWARN(WARN_REGEXP))
3048 {
3049 vWARN(RExC_parse,
3050 "Quantifier unexpected on zero-length expression");
3051 }
3052
3053 min += minnext * mincount;
3054 is_inf_internal |= ((maxcount == REG_INFTY
3055 && (minnext + deltanext) > 0)
3056 || deltanext == I32_MAX);
3057 is_inf |= is_inf_internal;
3058 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3059
3060 /* Try powerful optimization CURLYX => CURLYN. */
3061 if ( OP(oscan) == CURLYX && data
3062 && data->flags & SF_IN_PAR
3063 && !(data->flags & SF_HAS_EVAL)
3064 && !deltanext && minnext == 1 ) {
3065 /* Try to optimize to CURLYN. */
3066 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3067 regnode * const nxt1 = nxt;
497b47a8 3068#ifdef DEBUGGING
8aa23a47 3069 regnode *nxt2;
497b47a8 3070#endif
c277df42 3071
8aa23a47
YO
3072 /* Skip open. */
3073 nxt = regnext(nxt);
3074 if (!strchr((const char*)PL_simple,OP(nxt))
3075 && !(PL_regkind[OP(nxt)] == EXACT
3076 && STR_LEN(nxt) == 1))
3077 goto nogo;
497b47a8 3078#ifdef DEBUGGING
8aa23a47 3079 nxt2 = nxt;
497b47a8 3080#endif
8aa23a47
YO
3081 nxt = regnext(nxt);
3082 if (OP(nxt) != CLOSE)
3083 goto nogo;
3084 if (RExC_open_parens) {
3085 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3086 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3087 }
3088 /* Now we know that nxt2 is the only contents: */
3089 oscan->flags = (U8)ARG(nxt);
3090 OP(oscan) = CURLYN;
3091 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3092
c277df42 3093#ifdef DEBUGGING
8aa23a47
YO
3094 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3095 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3096 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3097 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3098 OP(nxt + 1) = OPTIMIZED; /* was count. */
3099 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3100#endif
8aa23a47
YO
3101 }
3102 nogo:
3103
3104 /* Try optimization CURLYX => CURLYM. */
3105 if ( OP(oscan) == CURLYX && data
3106 && !(data->flags & SF_HAS_PAR)
3107 && !(data->flags & SF_HAS_EVAL)
3108 && !deltanext /* atom is fixed width */
3109 && minnext != 0 /* CURLYM can't handle zero width */
3110 ) {
3111 /* XXXX How to optimize if data == 0? */
3112 /* Optimize to a simpler form. */
3113 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3114 regnode *nxt2;
3115
3116 OP(oscan) = CURLYM;
3117 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3118 && (OP(nxt2) != WHILEM))
3119 nxt = nxt2;
3120 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3121 /* Need to optimize away parenths. */
3122 if (data->flags & SF_IN_PAR) {
3123 /* Set the parenth number. */
3124 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3125
3126 if (OP(nxt) != CLOSE)
3127 FAIL("Panic opt close");
3128 oscan->flags = (U8)ARG(nxt);
3129 if (RExC_open_parens) {
3130 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3131 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3132 }
8aa23a47
YO
3133 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3134 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3135
c277df42 3136#ifdef DEBUGGING
8aa23a47
YO
3137 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3138 OP(nxt + 1) = OPTIMIZED; /* was count. */
3139 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3140 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3141#endif
c277df42 3142#if 0
8aa23a47
YO
3143 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3144 regnode *nnxt = regnext(nxt1);
3145
3146 if (nnxt == nxt) {
3147 if (reg_off_by_arg[OP(nxt1)])
3148 ARG_SET(nxt1, nxt2 - nxt1);
3149 else if (nxt2 - nxt1 < U16_MAX)
3150 NEXT_OFF(nxt1) = nxt2 - nxt1;
3151 else
3152 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3153 }
8aa23a47 3154 nxt1 = nnxt;
c277df42 3155 }
5d1c421c 3156#endif
8aa23a47
YO
3157 /* Optimize again: */
3158 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3159 NULL, stopparen, recursed, NULL, 0,depth+1);
3160 }
3161 else
3162 oscan->flags = 0;
3163 }
3164 else if ((OP(oscan) == CURLYX)
3165 && (flags & SCF_WHILEM_VISITED_POS)
3166 /* See the comment on a similar expression above.
3167 However, this time it not a subexpression
3168 we care about, but the expression itself. */
3169 && (maxcount == REG_INFTY)
3170 && data && ++data->whilem_c < 16) {
3171 /* This stays as CURLYX, we can put the count/of pair. */
3172 /* Find WHILEM (as in regexec.c) */
3173 regnode *nxt = oscan + NEXT_OFF(oscan);
3174
3175 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3176 nxt += ARG(nxt);
3177 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3178 | (RExC_whilem_seen << 4)); /* On WHILEM */
3179 }
3180 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3181 pars++;
3182 if (flags & SCF_DO_SUBSTR) {
3183 SV *last_str = NULL;
3184 int counted = mincount != 0;
a0ed51b3 3185
8aa23a47
YO
3186 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3187#if defined(SPARC64_GCC_WORKAROUND)
3188 I32 b = 0;
3189 STRLEN l = 0;
3190 const char *s = NULL;
3191 I32 old = 0;
b515a41d 3192
8aa23a47
YO
3193 if (pos_before >= data->last_start_min)
3194 b = pos_before;
3195 else
3196 b = data->last_start_min;
b515a41d 3197
8aa23a47
YO
3198 l = 0;
3199 s = SvPV_const(data->last_found, l);
3200 old = b - data->last_start_min;
3201
3202#else
3203 I32 b = pos_before >= data->last_start_min
3204 ? pos_before : data->last_start_min;
3205 STRLEN l;
3206 const char * const s = SvPV_const(data->last_found, l);
3207 I32 old = b - data->last_start_min;
3208#endif
3209
3210 if (UTF)
3211 old = utf8_hop((U8*)s, old) - (U8*)s;
3212
3213 l -= old;
3214 /* Get the added string: */
3215 last_str = newSVpvn(s + old, l);
3216 if (UTF)
3217 SvUTF8_on(last_str);
3218 if (deltanext == 0 && pos_before == b) {
3219 /* What was added is a constant string */
3220 if (mincount > 1) {
3221 SvGROW(last_str, (mincount * l) + 1);
3222 repeatcpy(SvPVX(last_str) + l,
3223 SvPVX_const(last_str), l, mincount - 1);
3224 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3225 /* Add additional parts. */
3226 SvCUR_set(data->last_found,
3227 SvCUR(data->last_found) - l);
3228 sv_catsv(data->last_found, last_str);
3229 {
3230 SV * sv = data->last_found;
3231 MAGIC *mg =
3232 SvUTF8(sv) && SvMAGICAL(sv) ?
3233 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3234 if (mg && mg->mg_len >= 0)
3235 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3236 }
8aa23a47 3237 data->last_end += l * (mincount - 1);
b515a41d 3238 }
8aa23a47
YO
3239 } else {
3240 /* start offset must point into the last copy */
3241 data->last_start_min += minnext * (mincount - 1);
3242 data->last_start_max += is_inf ? I32_MAX
3243 : (maxcount - 1) * (minnext + data->pos_delta);
3244 }
c277df42 3245 }
8aa23a47
YO
3246 /* It is counted once already... */
3247 data->pos_min += minnext * (mincount - counted);
3248 data->pos_delta += - counted * deltanext +
3249 (minnext + deltanext) * maxcount - minnext * mincount;
3250 if (mincount != maxcount) {
3251 /* Cannot extend fixed substrings found inside
3252 the group. */
304ee84b 3253 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3254 if (mincount && last_str) {
3255 SV * const sv = data->last_found;
3256 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3257 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3258
3259 if (mg)
3260 mg->mg_len = -1;
3261 sv_setsv(sv, last_str);
3262 data->last_end = data->pos_min;
3263 data->last_start_min =
3264 data->pos_min - CHR_SVLEN(last_str);
3265 data->last_start_max = is_inf
3266 ? I32_MAX
3267 : data->pos_min + data->pos_delta
3268 - CHR_SVLEN(last_str);
3269 }
3270 data->longest = &(data->longest_float);
3271 }
3272 SvREFCNT_dec(last_str);
c277df42 3273 }
8aa23a47
YO
3274 if (data && (fl & SF_HAS_EVAL))
3275 data->flags |= SF_HAS_EVAL;
3276 optimize_curly_tail:
3277 if (OP(oscan) != CURLYX) {
3278 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3279 && NEXT_OFF(next))
3280 NEXT_OFF(oscan) += NEXT_OFF(next);
3281 }
3282 continue;
3283 default: /* REF and CLUMP only? */
3284 if (flags & SCF_DO_SUBSTR) {
304ee84b 3285 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3286 data->longest = &(data->longest_float);
3287 }
3288 is_inf = is_inf_internal = 1;
3289 if (flags & SCF_DO_STCLASS_OR)
3290 cl_anything(pRExC_state, data->start_class);
3291 flags &= ~SCF_DO_STCLASS;
3292 break;
c277df42 3293 }
8aa23a47
YO
3294 }
3295 else if (strchr((const char*)PL_simple,OP(scan))) {
3296 int value = 0;
653099ff 3297
8aa23a47 3298 if (flags & SCF_DO_SUBSTR) {
304ee84b 3299 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3300 data->pos_min++;
3301 }
3302 min++;
3303 if (flags & SCF_DO_STCLASS) {
3304 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3305
8aa23a47
YO
3306 /* Some of the logic below assumes that switching
3307 locale on will only add false positives. */
3308 switch (PL_regkind[OP(scan)]) {
3309 case SANY:
3310 default:
3311 do_default:
3312 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3313 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3314 cl_anything(pRExC_state, data->start_class);
3315 break;
3316 case REG_ANY:
3317 if (OP(scan) == SANY)
3318 goto do_default;
3319 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3320 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3321 || (data->start_class->flags & ANYOF_CLASS));
3322 cl_anything(pRExC_state, data->start_class);
653099ff 3323 }
8aa23a47
YO
3324 if (flags & SCF_DO_STCLASS_AND || !value)
3325 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3326 break;
3327 case ANYOF:
3328 if (flags & SCF_DO_STCLASS_AND)
3329 cl_and(data->start_class,
3330 (struct regnode_charclass_class*)scan);
653099ff 3331 else
8aa23a47
YO
3332 cl_or(pRExC_state, data->start_class,
3333 (struct regnode_charclass_class*)scan);
3334 break;
3335 case ALNUM:
3336 if (flags & SCF_DO_STCLASS_AND) {
3337 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3338 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3339 for (value = 0; value < 256; value++)
3340 if (!isALNUM(value))
3341 ANYOF_BITMAP_CLEAR(data->start_class, value);
3342 }
653099ff 3343 }
8aa23a47
YO
3344 else {
3345 if (data->start_class->flags & ANYOF_LOCALE)
3346 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3347 else {
3348 for (value = 0; value < 256; value++)
3349 if (isALNUM(value))
3350 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3351 }
8aa23a47
YO
3352 }
3353 break;
3354 case ALNUML:
3355 if (flags & SCF_DO_STCLASS_AND) {
3356 if (data->start_class->flags & ANYOF_LOCALE)
3357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3358 }
3359 else {
3360 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3361 data->start_class->flags |= ANYOF_LOCALE;
3362 }
3363 break;
3364 case NALNUM:
3365 if (flags & SCF_DO_STCLASS_AND) {
3366 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3367 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3368 for (value = 0; value < 256; value++)
3369 if (isALNUM(value))
3370 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3371 }
3372 }
8aa23a47
YO
3373 else {
3374 if (data->start_class->flags & ANYOF_LOCALE)
3375 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3376 else {
3377 for (value = 0; value < 256; value++)
3378 if (!isALNUM(value))
3379 ANYOF_BITMAP_SET(data->start_class, value);
3380 }
653099ff 3381 }
8aa23a47
YO
3382 break;
3383 case NALNUML:
3384 if (flags & SCF_DO_STCLASS_AND) {
3385 if (data->start_class->flags & ANYOF_LOCALE)
3386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3387 }
8aa23a47
YO
3388 else {
3389 data->start_class->flags |= ANYOF_LOCALE;
3390 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3391 }
3392 break;
3393 case SPACE:
3394 if (flags & SCF_DO_STCLASS_AND) {
3395 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3396 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3397 for (value = 0; value < 256; value++)
3398 if (!isSPACE(value))
3399 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3400 }
3401 }
8aa23a47
YO
3402 else {
3403 if (data->start_class->flags & ANYOF_LOCALE)
3404 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3405 else {
3406 for (value = 0; value < 256; value++)
3407 if (isSPACE(value))
3408 ANYOF_BITMAP_SET(data->start_class, value);
3409 }
653099ff 3410 }
8aa23a47
YO
3411 break;
3412 case SPACEL:
3413 if (flags & SCF_DO_STCLASS_AND) {
3414 if (data->start_class->flags & ANYOF_LOCALE)
3415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3416 }
3417 else {
3418 data->start_class->flags |= ANYOF_LOCALE;
3419 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3420 }
3421 break;
3422 case NSPACE:
3423 if (flags & SCF_DO_STCLASS_AND) {
3424 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3425 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3426 for (value = 0; value < 256; value++)
3427 if (isSPACE(value))
3428 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3429 }
8aa23a47
YO
3430 }
3431 else {
3432 if (data->start_class->flags & ANYOF_LOCALE)
3433 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3434 else {
3435 for (value = 0; value < 256; value++)
3436 if (!isSPACE(value))
3437 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3438 }
3439 }
8aa23a47
YO
3440 break;
3441 case NSPACEL:
3442 if (flags & SCF_DO_STCLASS_AND) {
3443 if (data->start_class->flags & ANYOF_LOCALE) {
3444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3445 for (value = 0; value < 256; value++)
3446 if (!isSPACE(value))
3447 ANYOF_BITMAP_CLEAR(data->start_class, value);
3448 }
653099ff 3449 }
8aa23a47
YO
3450 else {
3451 data->start_class->flags |= ANYOF_LOCALE;
3452 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3453 }
3454 break;
3455 case DIGIT:
3456 if (flags & SCF_DO_STCLASS_AND) {
3457 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3458 for (value = 0; value < 256; value++)
3459 if (!isDIGIT(value))
3460 ANYOF_BITMAP_CLEAR(data->start_class, value);
3461 }
3462 else {
3463 if (data->start_class->flags & ANYOF_LOCALE)
3464 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3465 else {
3466 for (value = 0; value < 256; value++)
3467 if (isDIGIT(value))
3468 ANYOF_BITMAP_SET(data->start_class, value);
3469 }
3470 }
3471 break;
3472 case NDIGIT:
3473 if (flags & SCF_DO_STCLASS_AND) {
3474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3475 for (value = 0; value < 256; value++)
3476 if (isDIGIT(value))
3477 ANYOF_BITMAP_CLEAR(data->start_class, value);
3478 }
3479 else {
3480 if (data->start_class->flags & ANYOF_LOCALE)
3481 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3482 else {
3483 for (value = 0; value < 256; value++)
3484 if (!isDIGIT(value))
3485 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3486 }
3487 }
8aa23a47
YO
3488 break;
3489 }
3490 if (flags & SCF_DO_STCLASS_OR)
3491 cl_and(data->start_class, and_withp);
3492 flags &= ~SCF_DO_STCLASS;
3493 }
3494 }
3495 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3496 data->flags |= (OP(scan) == MEOL
3497 ? SF_BEFORE_MEOL
3498 : SF_BEFORE_SEOL);
3499 }
3500 else if ( PL_regkind[OP(scan)] == BRANCHJ
3501 /* Lookbehind, or need to calculate parens/evals/stclass: */
3502 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3503 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3504 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3505 || OP(scan) == UNLESSM )
3506 {
3507 /* Negative Lookahead/lookbehind
3508 In this case we can't do fixed string optimisation.
3509 */
1de06328 3510
8aa23a47
YO
3511 I32 deltanext, minnext, fake = 0;
3512 regnode *nscan;
3513 struct regnode_charclass_class intrnl;
3514 int f = 0;
1de06328 3515
8aa23a47
YO
3516 data_fake.flags = 0;
3517 if (data) {
3518 data_fake.whilem_c = data->whilem_c;
3519 data_fake.last_closep = data->last_closep;
c277df42 3520 }
8aa23a47
YO
3521 else
3522 data_fake.last_closep = &fake;
58e23c8d 3523 data_fake.pos_delta = delta;
8aa23a47
YO
3524 if ( flags & SCF_DO_STCLASS && !scan->flags
3525 && OP(scan) == IFMATCH ) { /* Lookahead */
3526 cl_init(pRExC_state, &intrnl);
3527 data_fake.start_class = &intrnl;
3528 f |= SCF_DO_STCLASS_AND;
3529 }
3530 if (flags & SCF_WHILEM_VISITED_POS)
3531 f |= SCF_WHILEM_VISITED_POS;
3532 next = regnext(scan);
3533 nscan = NEXTOPER(NEXTOPER(scan));
3534 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3535 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3536 if (scan->flags) {
3537 if (deltanext) {
58e23c8d 3538 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3539 }
3540 else if (minnext > (I32)U8_MAX) {
58e23c8d 3541 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3542 }
3543 scan->flags = (U8)minnext;
3544 }
3545 if (data) {
3546 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3547 pars++;
3548 if (data_fake.flags & SF_HAS_EVAL)
3549 data->flags |= SF_HAS_EVAL;
3550 data->whilem_c = data_fake.whilem_c;
3551 }
3552 if (f & SCF_DO_STCLASS_AND) {
3553 const int was = (data->start_class->flags & ANYOF_EOS);
3554
3555 cl_and(data->start_class, &intrnl);
3556 if (was)
3557 data->start_class->flags |= ANYOF_EOS;
3558 }
cb434fcc 3559 }
8aa23a47
YO
3560#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3561 else {
3562 /* Positive Lookahead/lookbehind
3563 In this case we can do fixed string optimisation,
3564 but we must be careful about it. Note in the case of
3565 lookbehind the positions will be offset by the minimum
3566 length of the pattern, something we won't know about
3567 until after the recurse.
3568 */
3569 I32 deltanext, fake = 0;
3570 regnode *nscan;
3571 struct regnode_charclass_class intrnl;
3572 int f = 0;
3573 /* We use SAVEFREEPV so that when the full compile
3574 is finished perl will clean up the allocated
3575 minlens when its all done. This was we don't
3576 have to worry about freeing them when we know
3577 they wont be used, which would be a pain.
3578 */
3579 I32 *minnextp;
3580 Newx( minnextp, 1, I32 );
3581 SAVEFREEPV(minnextp);
3582
3583 if (data) {
3584 StructCopy(data, &data_fake, scan_data_t);
3585 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3586 f |= SCF_DO_SUBSTR;
3587 if (scan->flags)
304ee84b 3588 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3589 data_fake.last_found=newSVsv(data->last_found);
3590 }
3591 }
3592 else
3593 data_fake.last_closep = &fake;
3594 data_fake.flags = 0;
58e23c8d 3595 data_fake.pos_delta = delta;
8aa23a47
YO
3596 if (is_inf)
3597 data_fake.flags |= SF_IS_INF;
3598 if ( flags & SCF_DO_STCLASS && !scan->flags
3599 && OP(scan) == IFMATCH ) { /* Lookahead */
3600 cl_init(pRExC_state, &intrnl);
3601 data_fake.start_class = &intrnl;
3602 f |= SCF_DO_STCLASS_AND;
3603 }
3604 if (flags & SCF_WHILEM_VISITED_POS)
3605 f |= SCF_WHILEM_VISITED_POS;
3606 next = regnext(scan);
3607 nscan = NEXTOPER(NEXTOPER(scan));
3608
3609 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3610 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3611 if (scan->flags) {
3612 if (deltanext) {
58e23c8d 3613 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3614 }
3615 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3616 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3617 }
3618 scan->flags = (U8)*minnextp;
3619 }
3620
3621 *minnextp += min;
3622
3623 if (f & SCF_DO_STCLASS_AND) {
3624 const int was = (data->start_class->flags & ANYOF_EOS);
3625
3626 cl_and(data->start_class, &intrnl);
3627 if (was)
3628 data->start_class->flags |= ANYOF_EOS;
3629 }
3630 if (data) {
3631 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3632 pars++;
3633 if (data_fake.flags & SF_HAS_EVAL)
3634 data->flags |= SF_HAS_EVAL;
3635 data->whilem_c = data_fake.whilem_c;
3636 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3637 if (RExC_rx->minlen<*minnextp)
3638 RExC_rx->minlen=*minnextp;
304ee84b 3639 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3640 SvREFCNT_dec(data_fake.last_found);
3641
3642 if ( data_fake.minlen_fixed != minlenp )
3643 {
3644 data->offset_fixed= data_fake.offset_fixed;
3645 data->minlen_fixed= data_fake.minlen_fixed;
3646 data->lookbehind_fixed+= scan->flags;
3647 }
3648 if ( data_fake.minlen_float != minlenp )
3649 {
3650 data->minlen_float= data_fake.minlen_float;
3651 data->offset_float_min=data_fake.offset_float_min;
3652 data->offset_float_max=data_fake.offset_float_max;
3653 data->lookbehind_float+= scan->flags;
3654 }
3655 }
3656 }
3657
3658
40d049e4 3659 }
8aa23a47
YO
3660#endif
3661 }
3662 else if (OP(scan) == OPEN) {
3663 if (stopparen != (I32)ARG(scan))
3664 pars++;
3665 }
3666 else if (OP(scan) == CLOSE) {
3667 if (stopparen == (I32)ARG(scan)) {
3668 break;
3669 }
3670 if ((I32)ARG(scan) == is_par) {
3671 next = regnext(scan);
b515a41d 3672
8aa23a47
YO
3673 if ( next && (OP(next) != WHILEM) && next < last)
3674 is_par = 0; /* Disable optimization */
40d049e4 3675 }
8aa23a47
YO
3676 if (data)
3677 *(data->last_closep) = ARG(scan);
3678 }
3679 else if (OP(scan) == EVAL) {
c277df42
IZ
3680 if (data)
3681 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3682 }
3683 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3684 if (flags & SCF_DO_SUBSTR) {
304ee84b 3685 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3686 flags &= ~SCF_DO_SUBSTR;
40d049e4 3687 }
8aa23a47
YO
3688 if (data && OP(scan)==ACCEPT) {
3689 data->flags |= SCF_SEEN_ACCEPT;
3690 if (stopmin > min)
3691 stopmin = min;
e2e6a0f1 3692 }
8aa23a47
YO
3693 }
3694 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3695 {
0f5d15d6 3696 if (flags & SCF_DO_SUBSTR) {
304ee84b 3697 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3698 data->longest = &(data->longest_float);
3699 }
3700 is_inf = is_inf_internal = 1;
653099ff 3701 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3702 cl_anything(pRExC_state, data->start_class);
96776eda 3703 flags &= ~SCF_DO_STCLASS;
8aa23a47 3704 }
58e23c8d 3705 else if (OP(scan) == GPOS) {
bbe252da 3706 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3707 !(delta || is_inf || (data && data->pos_delta)))
3708 {
bbe252da
YO
3709 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3710 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3711 if (RExC_rx->gofs < (U32)min)
3712 RExC_rx->gofs = min;
3713 } else {
bbe252da 3714 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3715 RExC_rx->gofs = 0;
3716 }
3717 }
786e8c11 3718#ifdef TRIE_STUDY_OPT
40d049e4 3719#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3720 else if (PL_regkind[OP(scan)] == TRIE) {
3721 /* NOTE - There is similar code to this block above for handling
3722 BRANCH nodes on the initial study. If you change stuff here
3723 check there too. */
3724 regnode *trie_node= scan;
3725 regnode *tail= regnext(scan);
f8fc2ecf 3726 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3727 I32 max1 = 0, min1 = I32_MAX;
3728 struct regnode_charclass_class accum;
3729
3730 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3731 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3732 if (flags & SCF_DO_STCLASS)
3733 cl_init_zero(pRExC_state, &accum);
3734
3735 if (!trie->jump) {
3736 min1= trie->minlen;
3737 max1= trie->maxlen;
3738 } else {
3739 const regnode *nextbranch= NULL;
3740 U32 word;
3741
3742 for ( word=1 ; word <= trie->wordcount ; word++)
3743 {
3744 I32 deltanext=0, minnext=0, f = 0, fake;
3745 struct regnode_charclass_class this_class;
3746
3747 data_fake.flags = 0;
3748 if (data) {
3749 data_fake.whilem_c = data->whilem_c;
3750 data_fake.last_closep = data->last_closep;
3751 }
3752 else
3753 data_fake.last_closep = &fake;
58e23c8d 3754 data_fake.pos_delta = delta;
8aa23a47
YO
3755 if (flags & SCF_DO_STCLASS) {
3756 cl_init(pRExC_state, &this_class);
3757 data_fake.start_class = &this_class;
3758 f = SCF_DO_STCLASS_AND;
3759 }
3760 if (flags & SCF_WHILEM_VISITED_POS)
3761 f |= SCF_WHILEM_VISITED_POS;
3762
3763 if (trie->jump[word]) {
3764 if (!nextbranch)
3765 nextbranch = trie_node + trie->jump[0];
3766 scan= trie_node + trie->jump[word];
3767 /* We go from the jump point to the branch that follows
3768 it. Note this means we need the vestigal unused branches
3769 even though they arent otherwise used.
3770 */
3771 minnext = study_chunk(pRExC_state, &scan, minlenp,
3772 &deltanext, (regnode *)nextbranch, &data_fake,
3773 stopparen, recursed, NULL, f,depth+1);
3774 }
3775 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3776 nextbranch= regnext((regnode*)nextbranch);
3777
3778 if (min1 > (I32)(minnext + trie->minlen))
3779 min1 = minnext + trie->minlen;
3780 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3781 max1 = minnext + deltanext + trie->maxlen;
3782 if (deltanext == I32_MAX)
3783 is_inf = is_inf_internal = 1;
3784
3785 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3786 pars++;
3787 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3788 if ( stopmin > min + min1)
3789 stopmin = min + min1;
3790 flags &= ~SCF_DO_SUBSTR;
3791 if (data)
3792 data->flags |= SCF_SEEN_ACCEPT;
3793 }
3794 if (data) {
3795 if (data_fake.flags & SF_HAS_EVAL)
3796 data->flags |= SF_HAS_EVAL;
3797 data->whilem_c = data_fake.whilem_c;
3798 }
3799 if (flags & SCF_DO_STCLASS)
3800 cl_or(pRExC_state, &accum, &this_class);
3801 }
3802 }
3803 if (flags & SCF_DO_SUBSTR) {
3804 data->pos_min += min1;
3805 data->pos_delta += max1 - min1;
3806 if (max1 != min1 || is_inf)
3807 data->longest = &(data->longest_float);
3808 }
3809 min += min1;
3810 delta += max1 - min1;
3811 if (flags & SCF_DO_STCLASS_OR) {
3812 cl_or(pRExC_state, data->start_class, &accum);
3813 if (min1) {
3814 cl_and(data->start_class, and_withp);
3815 flags &= ~SCF_DO_STCLASS;
3816 }
3817