This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perldelta: Move the XIDStart change from bugs to incompat.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
e40e74fe
KW
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
81714fb9 140 HV *paren_names; /* Paren names */
1f1031fe 141
40d049e4
YO
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
b57e4118 144 I32 in_lookbehind;
4624b182 145 I32 contains_locale;
bb3f3ed2 146 I32 override_recoding;
830247a4
IZ
147#if ADD_TO_REGEXEC
148 char *starttry; /* -Dr: where regtry was called. */
149#define RExC_starttry (pRExC_state->starttry)
150#endif
3dab1dad 151#ifdef DEBUGGING
be8e71aa 152 const char *lastparse;
3dab1dad 153 I32 lastnum;
1f1031fe 154 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
155#define RExC_lastparse (pRExC_state->lastparse)
156#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 157#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 158#endif
830247a4
IZ
159} RExC_state_t;
160
e2509266 161#define RExC_flags (pRExC_state->flags)
830247a4 162#define RExC_precomp (pRExC_state->precomp)
288b8c02 163#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 164#define RExC_rx (pRExC_state->rx)
f8fc2ecf 165#define RExC_rxi (pRExC_state->rxi)
fac92740 166#define RExC_start (pRExC_state->start)
830247a4
IZ
167#define RExC_end (pRExC_state->end)
168#define RExC_parse (pRExC_state->parse)
169#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
170#ifdef RE_TRACK_PATTERN_OFFSETS
171#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
172#endif
830247a4 173#define RExC_emit (pRExC_state->emit)
fac92740 174#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 175#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
176#define RExC_naughty (pRExC_state->naughty)
177#define RExC_sawback (pRExC_state->sawback)
178#define RExC_seen (pRExC_state->seen)
179#define RExC_size (pRExC_state->size)
180#define RExC_npar (pRExC_state->npar)
e2e6a0f1 181#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
182#define RExC_extralen (pRExC_state->extralen)
183#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
184#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 185#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 186#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 187#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
188#define RExC_open_parens (pRExC_state->open_parens)
189#define RExC_close_parens (pRExC_state->close_parens)
190#define RExC_opend (pRExC_state->opend)
81714fb9 191#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
192#define RExC_recurse (pRExC_state->recurse)
193#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 194#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 195#define RExC_contains_locale (pRExC_state->contains_locale)
bb3f3ed2 196#define RExC_override_recoding (pRExC_state->override_recoding)
830247a4 197
cde0cee5 198
a687059c
LW
199#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
200#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
201 ((*s) == '{' && regcurly(s)))
a687059c 202
35c8bce7
LW
203#ifdef SPSTART
204#undef SPSTART /* dratted cpp namespace... */
205#endif
a687059c
LW
206/*
207 * Flags to be passed up and down.
208 */
a687059c 209#define WORST 0 /* Worst case. */
a3b492c3 210#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
211
212/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
d7b56a3c 213 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
fda99bee 214#define SIMPLE 0x02
a3b492c3
YO
215#define SPSTART 0x04 /* Starts with * or +. */
216#define TRYAGAIN 0x08 /* Weeded out a declaration. */
217#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 218
3dab1dad
YO
219#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
220
07be1b83
YO
221/* whether trie related optimizations are enabled */
222#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
223#define TRIE_STUDY_OPT
786e8c11 224#define FULL_TRIE_STUDY
07be1b83
YO
225#define TRIE_STCLASS
226#endif
1de06328
YO
227
228
40d049e4
YO
229
230#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
231#define PBITVAL(paren) (1 << ((paren) & 7))
232#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
233#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
234#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
235
bbd61b5f
KW
236/* If not already in utf8, do a longjmp back to the beginning */
237#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
238#define REQUIRE_UTF8 STMT_START { \
239 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
240 } STMT_END
40d049e4 241
1de06328
YO
242/* About scan_data_t.
243
244 During optimisation we recurse through the regexp program performing
245 various inplace (keyhole style) optimisations. In addition study_chunk
246 and scan_commit populate this data structure with information about
247 what strings MUST appear in the pattern. We look for the longest
3b753521 248 string that must appear at a fixed location, and we look for the
1de06328
YO
249 longest string that may appear at a floating location. So for instance
250 in the pattern:
251
252 /FOO[xX]A.*B[xX]BAR/
253
254 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
255 strings (because they follow a .* construct). study_chunk will identify
256 both FOO and BAR as being the longest fixed and floating strings respectively.
257
258 The strings can be composites, for instance
259
260 /(f)(o)(o)/
261
262 will result in a composite fixed substring 'foo'.
263
264 For each string some basic information is maintained:
265
266 - offset or min_offset
267 This is the position the string must appear at, or not before.
268 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
269 characters must match before the string we are searching for.
270 Likewise when combined with minlenp and the length of the string it
1de06328
YO
271 tells us how many characters must appear after the string we have
272 found.
273
274 - max_offset
275 Only used for floating strings. This is the rightmost point that
3b753521 276 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
277 string can occur infinitely far to the right.
278
279 - minlenp
280 A pointer to the minimum length of the pattern that the string
281 was found inside. This is important as in the case of positive
282 lookahead or positive lookbehind we can have multiple patterns
283 involved. Consider
284
285 /(?=FOO).*F/
286
287 The minimum length of the pattern overall is 3, the minimum length
288 of the lookahead part is 3, but the minimum length of the part that
289 will actually match is 1. So 'FOO's minimum length is 3, but the
290 minimum length for the F is 1. This is important as the minimum length
291 is used to determine offsets in front of and behind the string being
292 looked for. Since strings can be composites this is the length of the
486ec47a 293 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
294 the length is calculated by study_chunk, so that the minimum lengths
295 are not known until the full pattern has been compiled, thus the
296 pointer to the value.
297
298 - lookbehind
299
300 In the case of lookbehind the string being searched for can be
301 offset past the start point of the final matching string.
302 If this value was just blithely removed from the min_offset it would
303 invalidate some of the calculations for how many chars must match
304 before or after (as they are derived from min_offset and minlen and
305 the length of the string being searched for).
306 When the final pattern is compiled and the data is moved from the
307 scan_data_t structure into the regexp structure the information
308 about lookbehind is factored in, with the information that would
309 have been lost precalculated in the end_shift field for the
310 associated string.
311
312 The fields pos_min and pos_delta are used to store the minimum offset
313 and the delta to the maximum offset at the current point in the pattern.
314
315*/
2c2d71f5
JH
316
317typedef struct scan_data_t {
1de06328
YO
318 /*I32 len_min; unused */
319 /*I32 len_delta; unused */
2c2d71f5
JH
320 I32 pos_min;
321 I32 pos_delta;
322 SV *last_found;
1de06328 323 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
324 I32 last_start_min;
325 I32 last_start_max;
1de06328
YO
326 SV **longest; /* Either &l_fixed, or &l_float. */
327 SV *longest_fixed; /* longest fixed string found in pattern */
328 I32 offset_fixed; /* offset where it starts */
486ec47a 329 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
330 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
331 SV *longest_float; /* longest floating string found in pattern */
332 I32 offset_float_min; /* earliest point in string it can appear */
333 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 334 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 335 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
336 I32 flags;
337 I32 whilem_c;
cb434fcc 338 I32 *last_closep;
653099ff 339 struct regnode_charclass_class *start_class;
2c2d71f5
JH
340} scan_data_t;
341
a687059c 342/*
e50aee73 343 * Forward declarations for pregcomp()'s friends.
a687059c 344 */
a0d0e21e 345
27da23d5 346static const scan_data_t zero_scan_data =
1de06328 347 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
348
349#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
350#define SF_BEFORE_SEOL 0x0001
351#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
352#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
353#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
354
09b7f37c
CB
355#ifdef NO_UNARY_PLUS
356# define SF_FIX_SHIFT_EOL (0+2)
357# define SF_FL_SHIFT_EOL (0+4)
358#else
359# define SF_FIX_SHIFT_EOL (+2)
360# define SF_FL_SHIFT_EOL (+4)
361#endif
c277df42
IZ
362
363#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
364#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
365
366#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
367#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
368#define SF_IS_INF 0x0040
369#define SF_HAS_PAR 0x0080
370#define SF_IN_PAR 0x0100
371#define SF_HAS_EVAL 0x0200
372#define SCF_DO_SUBSTR 0x0400
653099ff
GS
373#define SCF_DO_STCLASS_AND 0x0800
374#define SCF_DO_STCLASS_OR 0x1000
375#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 376#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 377
786e8c11 378#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 379#define SCF_SEEN_ACCEPT 0x8000
07be1b83 380
43fead97 381#define UTF cBOOL(RExC_utf8)
a62b1201
KW
382#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
383#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
384#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
385#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
386#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1
KW
387#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
388#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 389
43fead97 390#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 391
ffc61ed2 392#define OOB_UNICODE 12345678
93733859 393#define OOB_NAMEDCLASS -1
b8c5462f 394
a0ed51b3
LW
395#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
396#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
397
8615cb43 398
b45f050a
JF
399/* length of regex to show in messages that don't mark a position within */
400#define RegexLengthToShowInErrorMessages 127
401
402/*
403 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
404 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
405 * op/pragma/warn/regcomp.
406 */
7253e4e3
RK
407#define MARKER1 "<-- HERE" /* marker as it appears in the description */
408#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 409
7253e4e3 410#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
411
412/*
413 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
414 * arg. Show regex, up to a maximum length. If it's too long, chop and add
415 * "...".
416 */
58e23c8d 417#define _FAIL(code) STMT_START { \
bfed75c6 418 const char *ellipses = ""; \
ccb2c380
MP
419 IV len = RExC_end - RExC_precomp; \
420 \
421 if (!SIZE_ONLY) \
288b8c02 422 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
423 if (len > RegexLengthToShowInErrorMessages) { \
424 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
425 len = RegexLengthToShowInErrorMessages - 10; \
426 ellipses = "..."; \
427 } \
58e23c8d 428 code; \
ccb2c380 429} STMT_END
8615cb43 430
58e23c8d
YO
431#define FAIL(msg) _FAIL( \
432 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
433 msg, (int)len, RExC_precomp, ellipses))
434
435#define FAIL2(msg,arg) _FAIL( \
436 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
437 arg, (int)len, RExC_precomp, ellipses))
438
b45f050a 439/*
b45f050a
JF
440 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
441 */
ccb2c380 442#define Simple_vFAIL(m) STMT_START { \
a28509cc 443 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
444 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
445 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
b45f050a
JF
447
448/*
449 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
450 */
ccb2c380
MP
451#define vFAIL(m) STMT_START { \
452 if (!SIZE_ONLY) \
288b8c02 453 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
454 Simple_vFAIL(m); \
455} STMT_END
b45f050a
JF
456
457/*
458 * Like Simple_vFAIL(), but accepts two arguments.
459 */
ccb2c380 460#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 461 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
462 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
463 (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
b45f050a
JF
465
466/*
467 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
468 */
ccb2c380
MP
469#define vFAIL2(m,a1) STMT_START { \
470 if (!SIZE_ONLY) \
288b8c02 471 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
472 Simple_vFAIL2(m, a1); \
473} STMT_END
b45f050a
JF
474
475
476/*
477 * Like Simple_vFAIL(), but accepts three arguments.
478 */
ccb2c380 479#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 480 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
481 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
482 (int)offset, RExC_precomp, RExC_precomp + offset); \
483} STMT_END
b45f050a
JF
484
485/*
486 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
487 */
ccb2c380
MP
488#define vFAIL3(m,a1,a2) STMT_START { \
489 if (!SIZE_ONLY) \
288b8c02 490 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
491 Simple_vFAIL3(m, a1, a2); \
492} STMT_END
b45f050a
JF
493
494/*
495 * Like Simple_vFAIL(), but accepts four arguments.
496 */
ccb2c380 497#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 498 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
499 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
500 (int)offset, RExC_precomp, RExC_precomp + offset); \
501} STMT_END
b45f050a 502
668c081a 503#define ckWARNreg(loc,m) STMT_START { \
a28509cc 504 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
505 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
506 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
507} STMT_END
508
668c081a 509#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 510 const IV offset = loc - RExC_precomp; \
d1d15184 511 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
512 m REPORT_LOCATION, \
513 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
514} STMT_END
515
2335b3d3
KW
516#define ckWARN2regdep(loc,m, a1) STMT_START { \
517 const IV offset = loc - RExC_precomp; \
518 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
519 m REPORT_LOCATION, \
520 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
521} STMT_END
522
668c081a 523#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 524 const IV offset = loc - RExC_precomp; \
668c081a 525 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
526 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
527} STMT_END
528
529#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 530 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
531 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
532 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
533} STMT_END
534
668c081a
NC
535#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
536 const IV offset = loc - RExC_precomp; \
537 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
538 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
539} STMT_END
540
ccb2c380 541#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 542 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
543 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
544 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
545} STMT_END
546
668c081a
NC
547#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
548 const IV offset = loc - RExC_precomp; \
549 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
550 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
551} STMT_END
552
ccb2c380 553#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 554 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
555 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
556 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
557} STMT_END
9d1d55b5 558
8615cb43 559
cd439c50 560/* Allow for side effects in s */
ccb2c380
MP
561#define REGC(c,s) STMT_START { \
562 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
563} STMT_END
cd439c50 564
fac92740
MJD
565/* Macros for recording node offsets. 20001227 mjd@plover.com
566 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
567 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
568 * Element 0 holds the number n.
07be1b83 569 * Position is 1 indexed.
fac92740 570 */
7122b237
YO
571#ifndef RE_TRACK_PATTERN_OFFSETS
572#define Set_Node_Offset_To_R(node,byte)
573#define Set_Node_Offset(node,byte)
574#define Set_Cur_Node_Offset
575#define Set_Node_Length_To_R(node,len)
576#define Set_Node_Length(node,len)
577#define Set_Node_Cur_Length(node)
578#define Node_Offset(n)
579#define Node_Length(n)
580#define Set_Node_Offset_Length(node,offset,len)
581#define ProgLen(ri) ri->u.proglen
582#define SetProgLen(ri,x) ri->u.proglen = x
583#else
584#define ProgLen(ri) ri->u.offsets[0]
585#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
586#define Set_Node_Offset_To_R(node,byte) STMT_START { \
587 if (! SIZE_ONLY) { \
588 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 589 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 590 if((node) < 0) { \
551405c4 591 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
592 } else { \
593 RExC_offsets[2*(node)-1] = (byte); \
594 } \
595 } \
596} STMT_END
597
598#define Set_Node_Offset(node,byte) \
599 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
600#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
601
602#define Set_Node_Length_To_R(node,len) STMT_START { \
603 if (! SIZE_ONLY) { \
604 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 605 __LINE__, (int)(node), (int)(len))); \
ccb2c380 606 if((node) < 0) { \
551405c4 607 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
608 } else { \
609 RExC_offsets[2*(node)] = (len); \
610 } \
611 } \
612} STMT_END
613
614#define Set_Node_Length(node,len) \
615 Set_Node_Length_To_R((node)-RExC_emit_start, len)
616#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
617#define Set_Node_Cur_Length(node) \
618 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
619
620/* Get offsets and lengths */
621#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
622#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
623
07be1b83
YO
624#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
625 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
626 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
627} STMT_END
7122b237 628#endif
07be1b83
YO
629
630#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
631#define EXPERIMENTAL_INPLACESCAN
f427392e 632#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 633
304ee84b
YO
634#define DEBUG_STUDYDATA(str,data,depth) \
635DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 636 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
637 "%*s" str "Pos:%"IVdf"/%"IVdf \
638 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
639 (int)(depth)*2, "", \
640 (IV)((data)->pos_min), \
641 (IV)((data)->pos_delta), \
304ee84b 642 (UV)((data)->flags), \
1de06328 643 (IV)((data)->whilem_c), \
304ee84b
YO
644 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
645 is_inf ? "INF " : "" \
1de06328
YO
646 ); \
647 if ((data)->last_found) \
648 PerlIO_printf(Perl_debug_log, \
649 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
650 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
651 SvPVX_const((data)->last_found), \
652 (IV)((data)->last_end), \
653 (IV)((data)->last_start_min), \
654 (IV)((data)->last_start_max), \
655 ((data)->longest && \
656 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
657 SvPVX_const((data)->longest_fixed), \
658 (IV)((data)->offset_fixed), \
659 ((data)->longest && \
660 (data)->longest==&((data)->longest_float)) ? "*" : "", \
661 SvPVX_const((data)->longest_float), \
662 (IV)((data)->offset_float_min), \
663 (IV)((data)->offset_float_max) \
664 ); \
665 PerlIO_printf(Perl_debug_log,"\n"); \
666});
667
acfe0abc 668static void clear_re(pTHX_ void *r);
4327152a 669
653099ff 670/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 671 Update the longest found anchored substring and the longest found
653099ff
GS
672 floating substrings if needed. */
673
4327152a 674STATIC void
304ee84b 675S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 676{
e1ec3a88
AL
677 const STRLEN l = CHR_SVLEN(data->last_found);
678 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 679 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 680
7918f24d
NC
681 PERL_ARGS_ASSERT_SCAN_COMMIT;
682
c277df42 683 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 684 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
685 if (*data->longest == data->longest_fixed) {
686 data->offset_fixed = l ? data->last_start_min : data->pos_min;
687 if (data->flags & SF_BEFORE_EOL)
b81d288d 688 data->flags
c277df42
IZ
689 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
690 else
691 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
692 data->minlen_fixed=minlenp;
693 data->lookbehind_fixed=0;
a0ed51b3 694 }
304ee84b 695 else { /* *data->longest == data->longest_float */
c277df42 696 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
697 data->offset_float_max = (l
698 ? data->last_start_max
c277df42 699 : data->pos_min + data->pos_delta);
304ee84b 700 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 701 data->offset_float_max = I32_MAX;
c277df42 702 if (data->flags & SF_BEFORE_EOL)
b81d288d 703 data->flags
c277df42
IZ
704 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
705 else
706 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
707 data->minlen_float=minlenp;
708 data->lookbehind_float=0;
c277df42
IZ
709 }
710 }
711 SvCUR_set(data->last_found, 0);
0eda9292 712 {
a28509cc 713 SV * const sv = data->last_found;
097eb12c
AL
714 if (SvUTF8(sv) && SvMAGICAL(sv)) {
715 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
716 if (mg)
717 mg->mg_len = 0;
718 }
0eda9292 719 }
c277df42
IZ
720 data->last_end = -1;
721 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 722 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
723}
724
653099ff
GS
725/* Can match anything (initialization) */
726STATIC void
3fffb88a 727S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 728{
7918f24d
NC
729 PERL_ARGS_ASSERT_CL_ANYTHING;
730
f8bef550 731 ANYOF_BITMAP_SETALL(cl);
dd58aee1 732 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
05dd4d39
KW
733 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL
734 /* Even though no bitmap is in use here, we need to set
735 * the flag below so an AND with a node that does have one
736 * doesn't lose that one. The flag should get cleared if
737 * the other one doesn't; and the code in regexec.c is
738 * structured so this being set when not needed does no
739 * harm. It seemed a little cleaner to set it here than do
740 * a special case in cl_and() */
741 |ANYOF_NONBITMAP_NON_UTF8;
3fffb88a
KW
742
743 /* If any portion of the regex is to operate under locale rules,
744 * initialization includes it. The reason this isn't done for all regexes
745 * is that the optimizer was written under the assumption that locale was
746 * all-or-nothing. Given the complexity and lack of documentation in the
747 * optimizer, and that there are inadequate test cases for locale, so many
748 * parts of it may not work properly, it is safest to avoid locale unless
749 * necessary. */
750 if (RExC_contains_locale) {
9d7a1e63 751 ANYOF_CLASS_SETALL(cl); /* /l uses class */
3fffb88a
KW
752 cl->flags |= ANYOF_LOCALE;
753 }
9d7a1e63
KW
754 else {
755 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
756 }
653099ff
GS
757}
758
759/* Can match anything (initialization) */
760STATIC int
5f66b61c 761S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
762{
763 int value;
764
7918f24d
NC
765 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
766
aaa51d5e 767 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
768 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
769 return 1;
1aa99e6b
IH
770 if (!(cl->flags & ANYOF_UNICODE_ALL))
771 return 0;
10edeb5d 772 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 773 return 0;
653099ff
GS
774 return 1;
775}
776
777/* Can match anything (initialization) */
778STATIC void
e755fd73 779S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 780{
7918f24d
NC
781 PERL_ARGS_ASSERT_CL_INIT;
782
8ecf7187 783 Zero(cl, 1, struct regnode_charclass_class);
653099ff 784 cl->type = ANYOF;
3fffb88a 785 cl_anything(pRExC_state, cl);
1411dba4 786 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
653099ff
GS
787}
788
1051e1c4
KW
789/* These two functions currently do the exact same thing */
790#define cl_init_zero S_cl_init
653099ff 791
dd58aee1
KW
792/* 'AND' a given class with another one. Can create false positives. 'cl'
793 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
794 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 795STATIC void
5f66b61c 796S_cl_and(struct regnode_charclass_class *cl,
a28509cc 797 const struct regnode_charclass_class *and_with)
653099ff 798{
7918f24d 799 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
800
801 assert(and_with->type == ANYOF);
1e6ade67 802
c6b76537 803 /* I (khw) am not sure all these restrictions are necessary XXX */
1e6ade67
KW
804 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
805 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 806 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
807 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
808 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
809 int i;
810
811 if (and_with->flags & ANYOF_INVERT)
812 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
813 cl->bitmap[i] &= ~and_with->bitmap[i];
814 else
815 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
816 cl->bitmap[i] &= and_with->bitmap[i];
817 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
1aa99e6b 818
c6b76537 819 if (and_with->flags & ANYOF_INVERT) {
8951c461 820
c6b76537
KW
821 /* Here, the and'ed node is inverted. Get the AND of the flags that
822 * aren't affected by the inversion. Those that are affected are
823 * handled individually below */
824 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
825 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
826 cl->flags |= affected_flags;
827
828 /* We currently don't know how to deal with things that aren't in the
829 * bitmap, but we know that the intersection is no greater than what
830 * is already in cl, so let there be false positives that get sorted
831 * out after the synthetic start class succeeds, and the node is
832 * matched for real. */
833
834 /* The inversion of these two flags indicate that the resulting
835 * intersection doesn't have them */
836 if (and_with->flags & ANYOF_UNICODE_ALL) {
4713bfe1
KW
837 cl->flags &= ~ANYOF_UNICODE_ALL;
838 }
c6b76537
KW
839 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
840 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
137165a6 841 }
1aa99e6b 842 }
c6b76537 843 else { /* and'd node is not inverted */
137165a6 844 if (! ANYOF_NONBITMAP(and_with)) {
c6b76537
KW
845
846 /* Here 'and_with' doesn't match anything outside the bitmap
847 * (except possibly ANYOF_UNICODE_ALL), which means the
848 * intersection can't either, except for ANYOF_UNICODE_ALL, in
849 * which case we don't know what the intersection is, but it's no
850 * greater than what cl already has, so can just leave it alone,
851 * with possible false positives */
852 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
853 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
871d0d1a 854 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
c6b76537 855 }
137165a6 856 }
c6b76537
KW
857 else if (! ANYOF_NONBITMAP(cl)) {
858
859 /* Here, 'and_with' does match something outside the bitmap, and cl
860 * doesn't have a list of things to match outside the bitmap. If
861 * cl can match all code points above 255, the intersection will
862 * be those above-255 code points that 'and_with' matches. There
863 * may be false positives from code points in 'and_with' that are
864 * outside the bitmap but below 256, but those get sorted out
865 * after the synthetic start class succeeds). If cl can't match
866 * all Unicode code points, it means here that it can't match *
867 * anything outside the bitmap, so we leave the bitmap empty */
868 if (cl->flags & ANYOF_UNICODE_ALL) {
869 ARG_SET(cl, ARG(and_with));
870 }
871 }
872 else {
873 /* Here, both 'and_with' and cl match something outside the
874 * bitmap. Currently we do not do the intersection, so just match
875 * whatever cl had at the beginning. */
876 }
877
878
879 /* Take the intersection of the two sets of flags */
880 cl->flags &= and_with->flags;
137165a6 881 }
653099ff
GS
882}
883
dd58aee1
KW
884/* 'OR' a given class with another one. Can create false positives. 'cl'
885 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
886 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 887STATIC void
3fffb88a 888S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 889{
7918f24d
NC
890 PERL_ARGS_ASSERT_CL_OR;
891
653099ff 892 if (or_with->flags & ANYOF_INVERT) {
c6b76537
KW
893
894 /* Here, the or'd node is to be inverted. This means we take the
895 * complement of everything not in the bitmap, but currently we don't
896 * know what that is, so give up and match anything */
897 if (ANYOF_NONBITMAP(or_with)) {
3fffb88a 898 cl_anything(pRExC_state, cl);
c6b76537 899 }
653099ff
GS
900 /* We do not use
901 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
902 * <= (B1 | !B2) | (CL1 | !CL2)
903 * which is wasteful if CL2 is small, but we ignore CL2:
904 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
905 * XXXX Can we handle case-fold? Unclear:
906 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
907 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
908 */
c6b76537 909 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
910 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
911 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
912 int i;
913
914 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
915 cl->bitmap[i] |= ~or_with->bitmap[i];
916 } /* XXXX: logic is complicated otherwise */
917 else {
3fffb88a 918 cl_anything(pRExC_state, cl);
653099ff 919 }
c6b76537
KW
920
921 /* And, we can just take the union of the flags that aren't affected
922 * by the inversion */
923 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
924
925 /* For the remaining flags:
926 ANYOF_UNICODE_ALL and inverted means to not match anything above
927 255, which means that the union with cl should just be
928 what cl has in it, so can ignore this flag
929 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
930 is 127-255 to match them, but then invert that, so the
931 union with cl should just be what cl has in it, so can
932 ignore this flag
933 */
934 } else { /* 'or_with' is not inverted */
653099ff
GS
935 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
936 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
937 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
938 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
939 int i;
940
941 /* OR char bitmap and class bitmap separately */
942 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
943 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 944 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
945 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
946 cl->classflags[i] |= or_with->classflags[i];
947 cl->flags |= ANYOF_CLASS;
948 }
949 }
950 else { /* XXXX: logic is complicated, leave it along for a moment. */
3fffb88a 951 cl_anything(pRExC_state, cl);
653099ff 952 }
9826f543 953
c6b76537
KW
954 if (ANYOF_NONBITMAP(or_with)) {
955
956 /* Use the added node's outside-the-bit-map match if there isn't a
957 * conflict. If there is a conflict (both nodes match something
958 * outside the bitmap, but what they match outside is not the same
959 * pointer, and hence not easily compared until XXX we extend
960 * inversion lists this far), give up and allow the start class to
d94b1d13
KW
961 * match everything outside the bitmap. If that stuff is all above
962 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
c6b76537
KW
963 if (! ANYOF_NONBITMAP(cl)) {
964 ARG_SET(cl, ARG(or_with));
965 }
966 else if (ARG(cl) != ARG(or_with)) {
d94b1d13
KW
967
968 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
969 cl_anything(pRExC_state, cl);
970 }
971 else {
972 cl->flags |= ANYOF_UNICODE_ALL;
973 }
c6b76537 974 }
4c34a693 975 }
0b9668ee
KW
976
977 /* Take the union */
978 cl->flags |= or_with->flags;
1aa99e6b 979 }
653099ff
GS
980}
981
a3621e74
YO
982#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
983#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
984#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
985#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
986
3dab1dad
YO
987
988#ifdef DEBUGGING
07be1b83 989/*
2b8b4781
NC
990 dump_trie(trie,widecharmap,revcharmap)
991 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
992 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
993
994 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
995 The _interim_ variants are used for debugging the interim
996 tables that are used to generate the final compressed
997 representation which is what dump_trie expects.
998
486ec47a 999 Part of the reason for their existence is to provide a form
3dab1dad 1000 of documentation as to how the different representations function.
07be1b83
YO
1001
1002*/
3dab1dad
YO
1003
1004/*
3dab1dad
YO
1005 Dumps the final compressed table form of the trie to Perl_debug_log.
1006 Used for debugging make_trie().
1007*/
b9a59e08 1008
3dab1dad 1009STATIC void
2b8b4781
NC
1010S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1011 AV *revcharmap, U32 depth)
3dab1dad
YO
1012{
1013 U32 state;
ab3bbdeb 1014 SV *sv=sv_newmortal();
55eed653 1015 int colwidth= widecharmap ? 6 : 4;
2e64971a 1016 U16 word;
3dab1dad
YO
1017 GET_RE_DEBUG_FLAGS_DECL;
1018
7918f24d 1019 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1020
3dab1dad
YO
1021 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1022 (int)depth * 2 + 2,"",
1023 "Match","Base","Ofs" );
1024
1025 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1026 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1027 if ( tmp ) {
ab3bbdeb
YO
1028 PerlIO_printf( Perl_debug_log, "%*s",
1029 colwidth,
ddc5bc0f 1030 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1031 PL_colors[0], PL_colors[1],
1032 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1033 PERL_PV_ESCAPE_FIRSTCHAR
1034 )
1035 );
3dab1dad
YO
1036 }
1037 }
1038 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1039 (int)depth * 2 + 2,"");
1040
1041 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1042 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1043 PerlIO_printf( Perl_debug_log, "\n");
1044
1e2e3d02 1045 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1046 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1047
1048 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1049
1050 if ( trie->states[ state ].wordnum ) {
1051 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1052 } else {
1053 PerlIO_printf( Perl_debug_log, "%6s", "" );
1054 }
1055
1056 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1057
1058 if ( base ) {
1059 U32 ofs = 0;
1060
1061 while( ( base + ofs < trie->uniquecharcount ) ||
1062 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1063 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1064 ofs++;
1065
1066 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1067
1068 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1069 if ( ( base + ofs >= trie->uniquecharcount ) &&
1070 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1071 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1072 {
ab3bbdeb
YO
1073 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1074 colwidth,
3dab1dad
YO
1075 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1076 } else {
ab3bbdeb 1077 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1078 }
1079 }
1080
1081 PerlIO_printf( Perl_debug_log, "]");
1082
1083 }
1084 PerlIO_printf( Perl_debug_log, "\n" );
1085 }
2e64971a
DM
1086 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1087 for (word=1; word <= trie->wordcount; word++) {
1088 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1089 (int)word, (int)(trie->wordinfo[word].prev),
1090 (int)(trie->wordinfo[word].len));
1091 }
1092 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1093}
1094/*
3dab1dad
YO
1095 Dumps a fully constructed but uncompressed trie in list form.
1096 List tries normally only are used for construction when the number of
1097 possible chars (trie->uniquecharcount) is very high.
1098 Used for debugging make_trie().
1099*/
1100STATIC void
55eed653 1101S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1102 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1103 U32 depth)
3dab1dad
YO
1104{
1105 U32 state;
ab3bbdeb 1106 SV *sv=sv_newmortal();
55eed653 1107 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1108 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1109
1110 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1111
3dab1dad 1112 /* print out the table precompression. */
ab3bbdeb
YO
1113 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1114 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1115 "------:-----+-----------------\n" );
3dab1dad
YO
1116
1117 for( state=1 ; state < next_alloc ; state ++ ) {
1118 U16 charid;
1119
ab3bbdeb 1120 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1121 (int)depth * 2 + 2,"", (UV)state );
1122 if ( ! trie->states[ state ].wordnum ) {
1123 PerlIO_printf( Perl_debug_log, "%5s| ","");
1124 } else {
1125 PerlIO_printf( Perl_debug_log, "W%4x| ",
1126 trie->states[ state ].wordnum
1127 );
1128 }
1129 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1130 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1131 if ( tmp ) {
1132 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1133 colwidth,
ddc5bc0f 1134 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1135 PL_colors[0], PL_colors[1],
1136 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1137 PERL_PV_ESCAPE_FIRSTCHAR
1138 ) ,
1e2e3d02
YO
1139 TRIE_LIST_ITEM(state,charid).forid,
1140 (UV)TRIE_LIST_ITEM(state,charid).newstate
1141 );
1142 if (!(charid % 10))
664e119d
RGS
1143 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1144 (int)((depth * 2) + 14), "");
1e2e3d02 1145 }
ab3bbdeb
YO
1146 }
1147 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1148 }
1149}
1150
1151/*
3dab1dad
YO
1152 Dumps a fully constructed but uncompressed trie in table form.
1153 This is the normal DFA style state transition table, with a few
1154 twists to facilitate compression later.
1155 Used for debugging make_trie().
1156*/
1157STATIC void
55eed653 1158S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1159 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1160 U32 depth)
3dab1dad
YO
1161{
1162 U32 state;
1163 U16 charid;
ab3bbdeb 1164 SV *sv=sv_newmortal();
55eed653 1165 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1166 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1167
1168 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1169
1170 /*
1171 print out the table precompression so that we can do a visual check
1172 that they are identical.
1173 */
1174
1175 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1176
1177 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1178 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1179 if ( tmp ) {
ab3bbdeb
YO
1180 PerlIO_printf( Perl_debug_log, "%*s",
1181 colwidth,
ddc5bc0f 1182 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1183 PL_colors[0], PL_colors[1],
1184 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1185 PERL_PV_ESCAPE_FIRSTCHAR
1186 )
1187 );
3dab1dad
YO
1188 }
1189 }
1190
1191 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1192
1193 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1194 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1195 }
1196
1197 PerlIO_printf( Perl_debug_log, "\n" );
1198
1199 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1200
1201 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1202 (int)depth * 2 + 2,"",
1203 (UV)TRIE_NODENUM( state ) );
1204
1205 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1206 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1207 if (v)
1208 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1209 else
1210 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1211 }
1212 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1213 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1214 } else {
1215 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1216 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1217 }
1218 }
07be1b83 1219}
3dab1dad
YO
1220
1221#endif
1222
2e64971a 1223
786e8c11
YO
1224/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1225 startbranch: the first branch in the whole branch sequence
1226 first : start branch of sequence of branch-exact nodes.
1227 May be the same as startbranch
1228 last : Thing following the last branch.
1229 May be the same as tail.
1230 tail : item following the branch sequence
1231 count : words in the sequence
1232 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1233 depth : indent depth
3dab1dad 1234
786e8c11 1235Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1236
786e8c11
YO
1237A trie is an N'ary tree where the branches are determined by digital
1238decomposition of the key. IE, at the root node you look up the 1st character and
1239follow that branch repeat until you find the end of the branches. Nodes can be
1240marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1241
786e8c11 1242 /he|she|his|hers/
72f13be8 1243
786e8c11
YO
1244would convert into the following structure. Numbers represent states, letters
1245following numbers represent valid transitions on the letter from that state, if
1246the number is in square brackets it represents an accepting state, otherwise it
1247will be in parenthesis.
07be1b83 1248
786e8c11
YO
1249 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1250 | |
1251 | (2)
1252 | |
1253 (1) +-i->(6)-+-s->[7]
1254 |
1255 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1256
786e8c11
YO
1257 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1258
1259This shows that when matching against the string 'hers' we will begin at state 1
1260read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1261then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1262is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1263single traverse. We store a mapping from accepting to state to which word was
1264matched, and then when we have multiple possibilities we try to complete the
1265rest of the regex in the order in which they occured in the alternation.
1266
1267The only prior NFA like behaviour that would be changed by the TRIE support is
1268the silent ignoring of duplicate alternations which are of the form:
1269
1270 / (DUPE|DUPE) X? (?{ ... }) Y /x
1271
4b714af6 1272Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1273and without the optimisation. With the optimisations dupes will be silently
486ec47a 1274ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1275the following demonstrates:
1276
1277 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1278
1279which prints out 'word' three times, but
1280
1281 'words'=~/(word|word|word)(?{ print $1 })S/
1282
1283which doesnt print it out at all. This is due to other optimisations kicking in.
1284
1285Example of what happens on a structural level:
1286
486ec47a 1287The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1288
1289 1: CURLYM[1] {1,32767}(18)
1290 5: BRANCH(8)
1291 6: EXACT <ac>(16)
1292 8: BRANCH(11)
1293 9: EXACT <ad>(16)
1294 11: BRANCH(14)
1295 12: EXACT <ab>(16)
1296 16: SUCCEED(0)
1297 17: NOTHING(18)
1298 18: END(0)
1299
1300This would be optimizable with startbranch=5, first=5, last=16, tail=16
1301and should turn into:
1302
1303 1: CURLYM[1] {1,32767}(18)
1304 5: TRIE(16)
1305 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1306 <ac>
1307 <ad>
1308 <ab>
1309 16: SUCCEED(0)
1310 17: NOTHING(18)
1311 18: END(0)
1312
1313Cases where tail != last would be like /(?foo|bar)baz/:
1314
1315 1: BRANCH(4)
1316 2: EXACT <foo>(8)
1317 4: BRANCH(7)
1318 5: EXACT <bar>(8)
1319 7: TAIL(8)
1320 8: EXACT <baz>(10)
1321 10: END(0)
1322
1323which would be optimizable with startbranch=1, first=1, last=7, tail=8
1324and would end up looking like:
1325
1326 1: TRIE(8)
1327 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1328 <foo>
1329 <bar>
1330 7: TAIL(8)
1331 8: EXACT <baz>(10)
1332 10: END(0)
1333
1334 d = uvuni_to_utf8_flags(d, uv, 0);
1335
1336is the recommended Unicode-aware way of saying
1337
1338 *(d++) = uv;
1339*/
1340
1e2e3d02 1341#define TRIE_STORE_REVCHAR \
786e8c11 1342 STMT_START { \
73031816
NC
1343 if (UTF) { \
1344 SV *zlopp = newSV(2); \
88c9ea1e
CB
1345 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1346 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1347 SvCUR_set(zlopp, kapow - flrbbbbb); \
1348 SvPOK_on(zlopp); \
1349 SvUTF8_on(zlopp); \
1350 av_push(revcharmap, zlopp); \
1351 } else { \
6bdeddd2 1352 char ooooff = (char)uvc; \
73031816
NC
1353 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1354 } \
1355 } STMT_END
786e8c11
YO
1356
1357#define TRIE_READ_CHAR STMT_START { \
1358 wordlen++; \
1359 if ( UTF ) { \
1360 if ( folder ) { \
1361 if ( foldlen > 0 ) { \
1362 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1363 foldlen -= len; \
1364 scan += len; \
1365 len = 0; \
1366 } else { \
1367 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1368 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1369 foldlen -= UNISKIP( uvc ); \
1370 scan = foldbuf + UNISKIP( uvc ); \
1371 } \
1372 } else { \
1373 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1374 } \
1375 } else { \
1376 uvc = (U32)*uc; \
1377 len = 1; \
1378 } \
1379} STMT_END
1380
1381
1382
1383#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1384 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1385 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1386 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1387 } \
1388 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1389 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1390 TRIE_LIST_CUR( state )++; \
1391} STMT_END
07be1b83 1392
786e8c11
YO
1393#define TRIE_LIST_NEW(state) STMT_START { \
1394 Newxz( trie->states[ state ].trans.list, \
1395 4, reg_trie_trans_le ); \
1396 TRIE_LIST_CUR( state ) = 1; \
1397 TRIE_LIST_LEN( state ) = 4; \
1398} STMT_END
07be1b83 1399
786e8c11
YO
1400#define TRIE_HANDLE_WORD(state) STMT_START { \
1401 U16 dupe= trie->states[ state ].wordnum; \
1402 regnode * const noper_next = regnext( noper ); \
1403 \
786e8c11
YO
1404 DEBUG_r({ \
1405 /* store the word for dumping */ \
1406 SV* tmp; \
1407 if (OP(noper) != NOTHING) \
740cce10 1408 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1409 else \
740cce10 1410 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1411 av_push( trie_words, tmp ); \
786e8c11
YO
1412 }); \
1413 \
1414 curword++; \
2e64971a
DM
1415 trie->wordinfo[curword].prev = 0; \
1416 trie->wordinfo[curword].len = wordlen; \
1417 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1418 \
1419 if ( noper_next < tail ) { \
1420 if (!trie->jump) \
c944940b 1421 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1422 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1423 if (!jumper) \
1424 jumper = noper_next; \
1425 if (!nextbranch) \
1426 nextbranch= regnext(cur); \
1427 } \
1428 \
1429 if ( dupe ) { \
2e64971a
DM
1430 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1431 /* chain, so that when the bits of chain are later */\
1432 /* linked together, the dups appear in the chain */\
1433 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1434 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1435 } else { \
1436 /* we haven't inserted this word yet. */ \
1437 trie->states[ state ].wordnum = curword; \
1438 } \
1439} STMT_END
07be1b83 1440
3dab1dad 1441
786e8c11
YO
1442#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1443 ( ( base + charid >= ucharcount \
1444 && base + charid < ubound \
1445 && state == trie->trans[ base - ucharcount + charid ].check \
1446 && trie->trans[ base - ucharcount + charid ].next ) \
1447 ? trie->trans[ base - ucharcount + charid ].next \
1448 : ( state==1 ? special : 0 ) \
1449 )
3dab1dad 1450
786e8c11
YO
1451#define MADE_TRIE 1
1452#define MADE_JUMP_TRIE 2
1453#define MADE_EXACT_TRIE 4
3dab1dad 1454
a3621e74 1455STATIC I32
786e8c11 1456S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1457{
27da23d5 1458 dVAR;
a3621e74
YO
1459 /* first pass, loop through and scan words */
1460 reg_trie_data *trie;
55eed653 1461 HV *widecharmap = NULL;
2b8b4781 1462 AV *revcharmap = newAV();
a3621e74 1463 regnode *cur;
9f7f3913 1464 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1465 STRLEN len = 0;
1466 UV uvc = 0;
1467 U16 curword = 0;
1468 U32 next_alloc = 0;
786e8c11
YO
1469 regnode *jumper = NULL;
1470 regnode *nextbranch = NULL;
7f69552c 1471 regnode *convert = NULL;
2e64971a 1472 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1473 /* we just use folder as a flag in utf8 */
1e696034 1474 const U8 * folder = NULL;
a3621e74 1475
2b8b4781
NC
1476#ifdef DEBUGGING
1477 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1478 AV *trie_words = NULL;
1479 /* along with revcharmap, this only used during construction but both are
1480 * useful during debugging so we store them in the struct when debugging.
8e11feef 1481 */
2b8b4781
NC
1482#else
1483 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1484 STRLEN trie_charcount=0;
3dab1dad 1485#endif
2b8b4781 1486 SV *re_trie_maxbuff;
a3621e74 1487 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1488
1489 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1490#ifndef DEBUGGING
1491 PERL_UNUSED_ARG(depth);
1492#endif
a3621e74 1493
1e696034 1494 switch (flags) {
2f7f8cb1 1495 case EXACTFA:
1e696034
KW
1496 case EXACTFU: folder = PL_fold_latin1; break;
1497 case EXACTF: folder = PL_fold; break;
1498 case EXACTFL: folder = PL_fold_locale; break;
1499 }
1500
c944940b 1501 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1502 trie->refcount = 1;
3dab1dad 1503 trie->startstate = 1;
786e8c11 1504 trie->wordcount = word_count;
f8fc2ecf 1505 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1506 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1507 if (!(UTF && folder))
c944940b 1508 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1509 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1510 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1511
a3621e74 1512 DEBUG_r({
2b8b4781 1513 trie_words = newAV();
a3621e74 1514 });
a3621e74 1515
0111c4fd 1516 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1517 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1518 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1519 }
3dab1dad
YO
1520 DEBUG_OPTIMISE_r({
1521 PerlIO_printf( Perl_debug_log,
786e8c11 1522 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1523 (int)depth * 2 + 2, "",
1524 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1525 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1526 (int)depth);
3dab1dad 1527 });
7f69552c
YO
1528
1529 /* Find the node we are going to overwrite */
1530 if ( first == startbranch && OP( last ) != BRANCH ) {
1531 /* whole branch chain */
1532 convert = first;
1533 } else {
1534 /* branch sub-chain */
1535 convert = NEXTOPER( first );
1536 }
1537
a3621e74
YO
1538 /* -- First loop and Setup --
1539
1540 We first traverse the branches and scan each word to determine if it
1541 contains widechars, and how many unique chars there are, this is
1542 important as we have to build a table with at least as many columns as we
1543 have unique chars.
1544
1545 We use an array of integers to represent the character codes 0..255
38a44b82 1546 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1547 native representation of the character value as the key and IV's for the
1548 coded index.
1549
1550 *TODO* If we keep track of how many times each character is used we can
1551 remap the columns so that the table compression later on is more
3b753521 1552 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1553 middle and the least common are on the outside. IMO this would be better
1554 than a most to least common mapping as theres a decent chance the most
1555 common letter will share a node with the least common, meaning the node
486ec47a 1556 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1557 case is when we have the least common nodes twice.
1558
1559 */
1560
a3621e74 1561 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1562 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1563 const U8 *uc = (U8*)STRING( noper );
a28509cc 1564 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1565 STRLEN foldlen = 0;
1566 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1567 const U8 *scan = (U8*)NULL;
07be1b83 1568 U32 wordlen = 0; /* required init */
02daf0ab
YO
1569 STRLEN chars = 0;
1570 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1571
3dab1dad
YO
1572 if (OP(noper) == NOTHING) {
1573 trie->minlen= 0;
1574 continue;
1575 }
02daf0ab
YO
1576 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1577 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1578 regardless of encoding */
1579
a3621e74 1580 for ( ; uc < e ; uc += len ) {
3dab1dad 1581 TRIE_CHARCOUNT(trie)++;
a3621e74 1582 TRIE_READ_CHAR;
3dab1dad 1583 chars++;
a3621e74
YO
1584 if ( uvc < 256 ) {
1585 if ( !trie->charmap[ uvc ] ) {
1586 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1587 if ( folder )
1588 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1589 TRIE_STORE_REVCHAR;
a3621e74 1590 }
02daf0ab 1591 if ( set_bit ) {
62012aee
KW
1592 /* store the codepoint in the bitmap, and its folded
1593 * equivalent. */
02daf0ab 1594 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1595
1596 /* store the folded codepoint */
1597 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1598
1599 if ( !UTF ) {
1600 /* store first byte of utf8 representation of
acdf4139
KW
1601 variant codepoints */
1602 if (! UNI_IS_INVARIANT(uvc)) {
1603 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1604 }
1605 }
02daf0ab
YO
1606 set_bit = 0; /* We've done our bit :-) */
1607 }
a3621e74
YO
1608 } else {
1609 SV** svpp;
55eed653
NC
1610 if ( !widecharmap )
1611 widecharmap = newHV();
a3621e74 1612
55eed653 1613 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1614
1615 if ( !svpp )
e4584336 1616 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1617
1618 if ( !SvTRUE( *svpp ) ) {
1619 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1620 TRIE_STORE_REVCHAR;
a3621e74
YO
1621 }
1622 }
1623 }
3dab1dad
YO
1624 if( cur == first ) {
1625 trie->minlen=chars;
1626 trie->maxlen=chars;
1627 } else if (chars < trie->minlen) {
1628 trie->minlen=chars;
1629 } else if (chars > trie->maxlen) {
1630 trie->maxlen=chars;
1631 }
1632
a3621e74
YO
1633 } /* end first pass */
1634 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1635 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1636 (int)depth * 2 + 2,"",
55eed653 1637 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1638 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1639 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1640 );
a3621e74
YO
1641
1642 /*
1643 We now know what we are dealing with in terms of unique chars and
1644 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1645 representation using a flat table will take. If it's over a reasonable
1646 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1647 conservative but potentially much slower representation using an array
1648 of lists.
1649
1650 At the end we convert both representations into the same compressed
1651 form that will be used in regexec.c for matching with. The latter
1652 is a form that cannot be used to construct with but has memory
1653 properties similar to the list form and access properties similar
1654 to the table form making it both suitable for fast searches and
1655 small enough that its feasable to store for the duration of a program.
1656
1657 See the comment in the code where the compressed table is produced
1658 inplace from the flat tabe representation for an explanation of how
1659 the compression works.
1660
1661 */
1662
1663
2e64971a
DM
1664 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1665 prev_states[1] = 0;
1666
3dab1dad 1667 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1668 /*
1669 Second Pass -- Array Of Lists Representation
1670
1671 Each state will be represented by a list of charid:state records
1672 (reg_trie_trans_le) the first such element holds the CUR and LEN
1673 points of the allocated array. (See defines above).
1674
1675 We build the initial structure using the lists, and then convert
1676 it into the compressed table form which allows faster lookups
1677 (but cant be modified once converted).
a3621e74
YO
1678 */
1679
a3621e74
YO
1680 STRLEN transcount = 1;
1681
1e2e3d02
YO
1682 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1683 "%*sCompiling trie using list compiler\n",
1684 (int)depth * 2 + 2, ""));
446bd890 1685
c944940b
JH
1686 trie->states = (reg_trie_state *)
1687 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1688 sizeof(reg_trie_state) );
a3621e74
YO
1689 TRIE_LIST_NEW(1);
1690 next_alloc = 2;
1691
1692 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1693
c445ea15
AL
1694 regnode * const noper = NEXTOPER( cur );
1695 U8 *uc = (U8*)STRING( noper );
1696 const U8 * const e = uc + STR_LEN( noper );
1697 U32 state = 1; /* required init */
1698 U16 charid = 0; /* sanity init */
1699 U8 *scan = (U8*)NULL; /* sanity init */
1700 STRLEN foldlen = 0; /* required init */
07be1b83 1701 U32 wordlen = 0; /* required init */
c445ea15
AL
1702 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1703
3dab1dad 1704 if (OP(noper) != NOTHING) {
786e8c11 1705 for ( ; uc < e ; uc += len ) {
c445ea15 1706
786e8c11 1707 TRIE_READ_CHAR;
c445ea15 1708
786e8c11
YO
1709 if ( uvc < 256 ) {
1710 charid = trie->charmap[ uvc ];
c445ea15 1711 } else {
55eed653 1712 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1713 if ( !svpp ) {
1714 charid = 0;
1715 } else {
1716 charid=(U16)SvIV( *svpp );
1717 }
c445ea15 1718 }
786e8c11
YO
1719 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1720 if ( charid ) {
a3621e74 1721
786e8c11
YO
1722 U16 check;
1723 U32 newstate = 0;
a3621e74 1724
786e8c11
YO
1725 charid--;
1726 if ( !trie->states[ state ].trans.list ) {
1727 TRIE_LIST_NEW( state );
c445ea15 1728 }
786e8c11
YO
1729 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1730 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1731 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1732 break;
1733 }
1734 }
1735 if ( ! newstate ) {
1736 newstate = next_alloc++;
2e64971a 1737 prev_states[newstate] = state;
786e8c11
YO
1738 TRIE_LIST_PUSH( state, charid, newstate );
1739 transcount++;
1740 }
1741 state = newstate;
1742 } else {
1743 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1744 }
a28509cc 1745 }
c445ea15 1746 }
3dab1dad 1747 TRIE_HANDLE_WORD(state);
a3621e74
YO
1748
1749 } /* end second pass */
1750
1e2e3d02
YO
1751 /* next alloc is the NEXT state to be allocated */
1752 trie->statecount = next_alloc;
c944940b
JH
1753 trie->states = (reg_trie_state *)
1754 PerlMemShared_realloc( trie->states,
1755 next_alloc
1756 * sizeof(reg_trie_state) );
a3621e74 1757
3dab1dad 1758 /* and now dump it out before we compress it */
2b8b4781
NC
1759 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1760 revcharmap, next_alloc,
1761 depth+1)
1e2e3d02 1762 );
a3621e74 1763
c944940b
JH
1764 trie->trans = (reg_trie_trans *)
1765 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1766 {
1767 U32 state;
a3621e74
YO
1768 U32 tp = 0;
1769 U32 zp = 0;
1770
1771
1772 for( state=1 ; state < next_alloc ; state ++ ) {
1773 U32 base=0;
1774
1775 /*
1776 DEBUG_TRIE_COMPILE_MORE_r(
1777 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1778 );
1779 */
1780
1781 if (trie->states[state].trans.list) {
1782 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1783 U16 maxid=minid;
a28509cc 1784 U16 idx;
a3621e74
YO
1785
1786 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1787 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1788 if ( forid < minid ) {
1789 minid=forid;
1790 } else if ( forid > maxid ) {
1791 maxid=forid;
1792 }
a3621e74
YO
1793 }
1794 if ( transcount < tp + maxid - minid + 1) {
1795 transcount *= 2;
c944940b
JH
1796 trie->trans = (reg_trie_trans *)
1797 PerlMemShared_realloc( trie->trans,
446bd890
NC
1798 transcount
1799 * sizeof(reg_trie_trans) );
a3621e74
YO
1800 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1801 }
1802 base = trie->uniquecharcount + tp - minid;
1803 if ( maxid == minid ) {
1804 U32 set = 0;
1805 for ( ; zp < tp ; zp++ ) {
1806 if ( ! trie->trans[ zp ].next ) {
1807 base = trie->uniquecharcount + zp - minid;
1808 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1809 trie->trans[ zp ].check = state;
1810 set = 1;
1811 break;
1812 }
1813 }
1814 if ( !set ) {
1815 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1816 trie->trans[ tp ].check = state;
1817 tp++;
1818 zp = tp;
1819 }
1820 } else {
1821 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1822 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1823 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1824 trie->trans[ tid ].check = state;
1825 }
1826 tp += ( maxid - minid + 1 );
1827 }
1828 Safefree(trie->states[ state ].trans.list);
1829 }
1830 /*
1831 DEBUG_TRIE_COMPILE_MORE_r(
1832 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1833 );
1834 */
1835 trie->states[ state ].trans.base=base;
1836 }
cc601c31 1837 trie->lasttrans = tp + 1;
a3621e74
YO
1838 }
1839 } else {
1840 /*
1841 Second Pass -- Flat Table Representation.
1842
1843 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1844 We know that we will need Charcount+1 trans at most to store the data
1845 (one row per char at worst case) So we preallocate both structures
1846 assuming worst case.
1847
1848 We then construct the trie using only the .next slots of the entry
1849 structs.
1850
3b753521 1851 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1852 make compression both faster and easier by keeping track of how many non
1853 zero fields are in the node.
1854
1855 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1856 transition.
1857
1858 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1859 number representing the first entry of the node, and state as a
1860 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1861 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1862 are 2 entrys per node. eg:
1863
1864 A B A B
1865 1. 2 4 1. 3 7
1866 2. 0 3 3. 0 5
1867 3. 0 0 5. 0 0
1868 4. 0 0 7. 0 0
1869
1870 The table is internally in the right hand, idx form. However as we also
1871 have to deal with the states array which is indexed by nodenum we have to
1872 use TRIE_NODENUM() to convert.
1873
1874 */
1e2e3d02
YO
1875 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1876 "%*sCompiling trie using table compiler\n",
1877 (int)depth * 2 + 2, ""));
3dab1dad 1878
c944940b
JH
1879 trie->trans = (reg_trie_trans *)
1880 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1881 * trie->uniquecharcount + 1,
1882 sizeof(reg_trie_trans) );
1883 trie->states = (reg_trie_state *)
1884 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1885 sizeof(reg_trie_state) );
a3621e74
YO
1886 next_alloc = trie->uniquecharcount + 1;
1887
3dab1dad 1888
a3621e74
YO
1889 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1890
c445ea15 1891 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1892 const U8 *uc = (U8*)STRING( noper );
1893 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1894
1895 U32 state = 1; /* required init */
1896
1897 U16 charid = 0; /* sanity init */
1898 U32 accept_state = 0; /* sanity init */
1899 U8 *scan = (U8*)NULL; /* sanity init */
1900
1901 STRLEN foldlen = 0; /* required init */
07be1b83 1902 U32 wordlen = 0; /* required init */
a3621e74
YO
1903 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1904
3dab1dad 1905 if ( OP(noper) != NOTHING ) {
786e8c11 1906 for ( ; uc < e ; uc += len ) {
a3621e74 1907
786e8c11 1908 TRIE_READ_CHAR;
a3621e74 1909
786e8c11
YO
1910 if ( uvc < 256 ) {
1911 charid = trie->charmap[ uvc ];
1912 } else {
55eed653 1913 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1914 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1915 }
786e8c11
YO
1916 if ( charid ) {
1917 charid--;
1918 if ( !trie->trans[ state + charid ].next ) {
1919 trie->trans[ state + charid ].next = next_alloc;
1920 trie->trans[ state ].check++;
2e64971a
DM
1921 prev_states[TRIE_NODENUM(next_alloc)]
1922 = TRIE_NODENUM(state);
786e8c11
YO
1923 next_alloc += trie->uniquecharcount;
1924 }
1925 state = trie->trans[ state + charid ].next;
1926 } else {
1927 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1928 }
1929 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1930 }
a3621e74 1931 }
3dab1dad
YO
1932 accept_state = TRIE_NODENUM( state );
1933 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1934
1935 } /* end second pass */
1936
3dab1dad 1937 /* and now dump it out before we compress it */
2b8b4781
NC
1938 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1939 revcharmap,
1940 next_alloc, depth+1));
a3621e74 1941
a3621e74
YO
1942 {
1943 /*
1944 * Inplace compress the table.*
1945
1946 For sparse data sets the table constructed by the trie algorithm will
1947 be mostly 0/FAIL transitions or to put it another way mostly empty.
1948 (Note that leaf nodes will not contain any transitions.)
1949
1950 This algorithm compresses the tables by eliminating most such
1951 transitions, at the cost of a modest bit of extra work during lookup:
1952
1953 - Each states[] entry contains a .base field which indicates the
1954 index in the state[] array wheres its transition data is stored.
1955
3b753521 1956 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1957
1958 - If .base is nonzero then charid is added to it to find an entry in
1959 the trans array.
1960
1961 -If trans[states[state].base+charid].check!=state then the
1962 transition is taken to be a 0/Fail transition. Thus if there are fail
1963 transitions at the front of the node then the .base offset will point
1964 somewhere inside the previous nodes data (or maybe even into a node
1965 even earlier), but the .check field determines if the transition is
1966 valid.
1967
786e8c11 1968 XXX - wrong maybe?
a3621e74 1969 The following process inplace converts the table to the compressed
3b753521 1970 table: We first do not compress the root node 1,and mark all its
a3621e74 1971 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1972 allows us to do a DFA construction from the compressed table later,
1973 and ensures that any .base pointers we calculate later are greater
1974 than 0.
a3621e74
YO
1975
1976 - We set 'pos' to indicate the first entry of the second node.
1977
1978 - We then iterate over the columns of the node, finding the first and
1979 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1980 and set the .check pointers accordingly, and advance pos
1981 appropriately and repreat for the next node. Note that when we copy
1982 the next pointers we have to convert them from the original
1983 NODEIDX form to NODENUM form as the former is not valid post
1984 compression.
1985
1986 - If a node has no transitions used we mark its base as 0 and do not
1987 advance the pos pointer.
1988
1989 - If a node only has one transition we use a second pointer into the
1990 structure to fill in allocated fail transitions from other states.
1991 This pointer is independent of the main pointer and scans forward
1992 looking for null transitions that are allocated to a state. When it
1993 finds one it writes the single transition into the "hole". If the
786e8c11 1994 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1995
1996 - Once compressed we can Renew/realloc the structures to release the
1997 excess space.
1998
1999 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2000 specifically Fig 3.47 and the associated pseudocode.
2001
2002 demq
2003 */
a3b680e6 2004 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2005 U32 state, charid;
a3621e74 2006 U32 pos = 0, zp=0;
1e2e3d02 2007 trie->statecount = laststate;
a3621e74
YO
2008
2009 for ( state = 1 ; state < laststate ; state++ ) {
2010 U8 flag = 0;
a28509cc
AL
2011 const U32 stateidx = TRIE_NODEIDX( state );
2012 const U32 o_used = trie->trans[ stateidx ].check;
2013 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2014 trie->trans[ stateidx ].check = 0;
2015
2016 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2017 if ( flag || trie->trans[ stateidx + charid ].next ) {
2018 if ( trie->trans[ stateidx + charid ].next ) {
2019 if (o_used == 1) {
2020 for ( ; zp < pos ; zp++ ) {
2021 if ( ! trie->trans[ zp ].next ) {
2022 break;
2023 }
2024 }
2025 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2026 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2027 trie->trans[ zp ].check = state;
2028 if ( ++zp > pos ) pos = zp;
2029 break;
2030 }
2031 used--;
2032 }
2033 if ( !flag ) {
2034 flag = 1;
2035 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2036 }
2037 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2038 trie->trans[ pos ].check = state;
2039 pos++;
2040 }
2041 }
2042 }
cc601c31 2043 trie->lasttrans = pos + 1;
c944940b
JH
2044 trie->states = (reg_trie_state *)
2045 PerlMemShared_realloc( trie->states, laststate
2046 * sizeof(reg_trie_state) );
a3621e74 2047 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2048 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2049 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2050 (int)depth * 2 + 2,"",
2051 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2052 (IV)next_alloc,
2053 (IV)pos,
a3621e74
YO
2054 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2055 );
2056
2057 } /* end table compress */
2058 }
1e2e3d02
YO
2059 DEBUG_TRIE_COMPILE_MORE_r(
2060 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2061 (int)depth * 2 + 2, "",
2062 (UV)trie->statecount,
2063 (UV)trie->lasttrans)
2064 );
cc601c31 2065 /* resize the trans array to remove unused space */
c944940b
JH
2066 trie->trans = (reg_trie_trans *)
2067 PerlMemShared_realloc( trie->trans, trie->lasttrans
2068 * sizeof(reg_trie_trans) );
a3621e74 2069
3b753521 2070 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2071 U8 nodetype =(U8)(flags & 0xFF);
2072 char *str=NULL;
786e8c11 2073
07be1b83 2074#ifdef DEBUGGING
e62cc96a 2075 regnode *optimize = NULL;
7122b237
YO
2076#ifdef RE_TRACK_PATTERN_OFFSETS
2077
b57a0404
JH
2078 U32 mjd_offset = 0;
2079 U32 mjd_nodelen = 0;
7122b237
YO
2080#endif /* RE_TRACK_PATTERN_OFFSETS */
2081#endif /* DEBUGGING */
a3621e74 2082 /*
3dab1dad
YO
2083 This means we convert either the first branch or the first Exact,
2084 depending on whether the thing following (in 'last') is a branch
2085 or not and whther first is the startbranch (ie is it a sub part of
2086 the alternation or is it the whole thing.)
3b753521 2087 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2088 the whole branch sequence, including the first.
a3621e74 2089 */
3dab1dad 2090 /* Find the node we are going to overwrite */
7f69552c 2091 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2092 /* branch sub-chain */
3dab1dad 2093 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2094#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2095 DEBUG_r({
2096 mjd_offset= Node_Offset((convert));
2097 mjd_nodelen= Node_Length((convert));
2098 });
7122b237 2099#endif
7f69552c 2100 /* whole branch chain */
7122b237
YO
2101 }
2102#ifdef RE_TRACK_PATTERN_OFFSETS
2103 else {
7f69552c
YO
2104 DEBUG_r({
2105 const regnode *nop = NEXTOPER( convert );
2106 mjd_offset= Node_Offset((nop));
2107 mjd_nodelen= Node_Length((nop));
2108 });
07be1b83
YO
2109 }
2110 DEBUG_OPTIMISE_r(
2111 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2112 (int)depth * 2 + 2, "",
786e8c11 2113 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2114 );
7122b237 2115#endif
3dab1dad
YO
2116 /* But first we check to see if there is a common prefix we can
2117 split out as an EXACT and put in front of the TRIE node. */
2118 trie->startstate= 1;
55eed653 2119 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2120 U32 state;
1e2e3d02 2121 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2122 U32 ofs = 0;
8e11feef
RGS
2123 I32 idx = -1;
2124 U32 count = 0;
2125 const U32 base = trie->states[ state ].trans.base;
a3621e74 2126
3dab1dad 2127 if ( trie->states[state].wordnum )
8e11feef 2128 count = 1;
a3621e74 2129
8e11feef 2130 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2131 if ( ( base + ofs >= trie->uniquecharcount ) &&
2132 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2133 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2134 {
3dab1dad 2135 if ( ++count > 1 ) {
2b8b4781 2136 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2137 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2138 if ( state == 1 ) break;
3dab1dad
YO
2139 if ( count == 2 ) {
2140 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2141 DEBUG_OPTIMISE_r(
8e11feef
RGS
2142 PerlIO_printf(Perl_debug_log,
2143 "%*sNew Start State=%"UVuf" Class: [",
2144 (int)depth * 2 + 2, "",
786e8c11 2145 (UV)state));
be8e71aa 2146 if (idx >= 0) {
2b8b4781 2147 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2148 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2149
3dab1dad 2150 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2151 if ( folder )
2152 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2153 DEBUG_OPTIMISE_r(
f1f66076 2154 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2155 );
8e11feef
RGS
2156 }
2157 }
2158 TRIE_BITMAP_SET(trie,*ch);
2159 if ( folder )
2160 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2161 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2162 }
2163 idx = ofs;
2164 }
3dab1dad
YO
2165 }
2166 if ( count == 1 ) {
2b8b4781 2167 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2168 STRLEN len;
2169 char *ch = SvPV( *tmp, len );
de734bd5
A
2170 DEBUG_OPTIMISE_r({
2171 SV *sv=sv_newmortal();
8e11feef
RGS
2172 PerlIO_printf( Perl_debug_log,
2173 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2174 (int)depth * 2 + 2, "",
de734bd5
A
2175 (UV)state, (UV)idx,
2176 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2177 PL_colors[0], PL_colors[1],
2178 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2179 PERL_PV_ESCAPE_FIRSTCHAR
2180 )
2181 );
2182 });
3dab1dad
YO
2183 if ( state==1 ) {
2184 OP( convert ) = nodetype;
2185 str=STRING(convert);
2186 STR_LEN(convert)=0;
2187 }
c490c714
YO
2188 STR_LEN(convert) += len;
2189 while (len--)
de734bd5 2190 *str++ = *ch++;
8e11feef 2191 } else {
f9049ba1 2192#ifdef DEBUGGING
8e11feef
RGS
2193 if (state>1)
2194 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2195#endif
8e11feef
RGS
2196 break;
2197 }
2198 }
2e64971a 2199 trie->prefixlen = (state-1);
3dab1dad 2200 if (str) {
8e11feef 2201 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2202 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2203 trie->startstate = state;
07be1b83
YO
2204 trie->minlen -= (state - 1);
2205 trie->maxlen -= (state - 1);
33809eae
JH
2206#ifdef DEBUGGING
2207 /* At least the UNICOS C compiler choked on this
2208 * being argument to DEBUG_r(), so let's just have
2209 * it right here. */
2210 if (
2211#ifdef PERL_EXT_RE_BUILD
2212 1
2213#else
2214 DEBUG_r_TEST
2215#endif
2216 ) {
2217 regnode *fix = convert;
2218 U32 word = trie->wordcount;
2219 mjd_nodelen++;
2220 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2221 while( ++fix < n ) {
2222 Set_Node_Offset_Length(fix, 0, 0);
2223 }
2224 while (word--) {
2225 SV ** const tmp = av_fetch( trie_words, word, 0 );
2226 if (tmp) {
2227 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2228 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2229 else
2230 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2231 }
2232 }
2233 }
2234#endif
8e11feef
RGS
2235 if (trie->maxlen) {
2236 convert = n;
2237 } else {
3dab1dad 2238 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2239 DEBUG_r(optimize= n);
3dab1dad
YO
2240 }
2241 }
2242 }
a5ca303d
YO
2243 if (!jumper)
2244 jumper = last;
3dab1dad 2245 if ( trie->maxlen ) {
8e11feef
RGS
2246 NEXT_OFF( convert ) = (U16)(tail - convert);
2247 ARG_SET( convert, data_slot );
786e8c11
YO
2248 /* Store the offset to the first unabsorbed branch in
2249 jump[0], which is otherwise unused by the jump logic.
2250 We use this when dumping a trie and during optimisation. */
2251 if (trie->jump)
7f69552c 2252 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2253
6c48061a
YO
2254 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2255 * and there is a bitmap
2256 * and the first "jump target" node we found leaves enough room
2257 * then convert the TRIE node into a TRIEC node, with the bitmap
2258 * embedded inline in the opcode - this is hypothetically faster.
2259 */
2260 if ( !trie->states[trie->startstate].wordnum
2261 && trie->bitmap
2262 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2263 {
2264 OP( convert ) = TRIEC;
2265 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2266 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2267 trie->bitmap= NULL;
2268 } else
2269 OP( convert ) = TRIE;
a3621e74 2270
3dab1dad
YO
2271 /* store the type in the flags */
2272 convert->flags = nodetype;
a5ca303d
YO
2273 DEBUG_r({
2274 optimize = convert
2275 + NODE_STEP_REGNODE
2276 + regarglen[ OP( convert ) ];
2277 });
2278 /* XXX We really should free up the resource in trie now,
2279 as we won't use them - (which resources?) dmq */
3dab1dad 2280 }
a3621e74 2281 /* needed for dumping*/
e62cc96a 2282 DEBUG_r(if (optimize) {
07be1b83 2283 regnode *opt = convert;
bcdf7404 2284
e62cc96a 2285 while ( ++opt < optimize) {
07be1b83
YO
2286 Set_Node_Offset_Length(opt,0,0);
2287 }
786e8c11
YO
2288 /*
2289 Try to clean up some of the debris left after the
2290 optimisation.
a3621e74 2291 */
786e8c11 2292 while( optimize < jumper ) {
07be1b83 2293 mjd_nodelen += Node_Length((optimize));
a3621e74 2294 OP( optimize ) = OPTIMIZED;
07be1b83 2295 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2296 optimize++;
2297 }
07be1b83 2298 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2299 });
2300 } /* end node insert */
2e64971a
DM
2301
2302 /* Finish populating the prev field of the wordinfo array. Walk back
2303 * from each accept state until we find another accept state, and if
2304 * so, point the first word's .prev field at the second word. If the
2305 * second already has a .prev field set, stop now. This will be the
2306 * case either if we've already processed that word's accept state,
3b753521
FN
2307 * or that state had multiple words, and the overspill words were
2308 * already linked up earlier.
2e64971a
DM
2309 */
2310 {
2311 U16 word;
2312 U32 state;
2313 U16 prev;
2314
2315 for (word=1; word <= trie->wordcount; word++) {
2316 prev = 0;
2317 if (trie->wordinfo[word].prev)
2318 continue;
2319 state = trie->wordinfo[word].accept;
2320 while (state) {
2321 state = prev_states[state];
2322 if (!state)
2323 break;
2324 prev = trie->states[state].wordnum;
2325 if (prev)
2326 break;
2327 }
2328 trie->wordinfo[word].prev = prev;
2329 }
2330 Safefree(prev_states);
2331 }
2332
2333
2334 /* and now dump out the compressed format */
2335 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2336
55eed653 2337 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2338#ifdef DEBUGGING
2339 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2340 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2341#else
2342 SvREFCNT_dec(revcharmap);
07be1b83 2343#endif
786e8c11
YO
2344 return trie->jump
2345 ? MADE_JUMP_TRIE
2346 : trie->startstate>1
2347 ? MADE_EXACT_TRIE
2348 : MADE_TRIE;
2349}
2350
2351STATIC void
2352S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2353{
3b753521 2354/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2355
2356 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2357 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2358 ISBN 0-201-10088-6
2359
2360 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2361 suffix of the current state's 'word' that is also a proper prefix of another word in our
2362 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2363 the DFA not to have to restart after its tried and failed a word at a given point, it
2364 simply continues as though it had been matching the other word in the first place.
2365 Consider
2366 'abcdgu'=~/abcdefg|cdgu/
2367 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2368 fail, which would bring us to the state representing 'd' in the second word where we would
2369 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2370 */
2371 /* add a fail transition */
3251b653
NC
2372 const U32 trie_offset = ARG(source);
2373 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2374 U32 *q;
2375 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2376 const U32 numstates = trie->statecount;
786e8c11
YO
2377 const U32 ubound = trie->lasttrans + ucharcount;
2378 U32 q_read = 0;
2379 U32 q_write = 0;
2380 U32 charid;
2381 U32 base = trie->states[ 1 ].trans.base;
2382 U32 *fail;
2383 reg_ac_data *aho;
2384 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2385 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2386
2387 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2388#ifndef DEBUGGING
2389 PERL_UNUSED_ARG(depth);
2390#endif
2391
2392
2393 ARG_SET( stclass, data_slot );
c944940b 2394 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2395 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2396 aho->trie=trie_offset;
446bd890
NC
2397 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2398 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2399 Newxz( q, numstates, U32);
c944940b 2400 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2401 aho->refcount = 1;
2402 fail = aho->fail;
2403 /* initialize fail[0..1] to be 1 so that we always have
2404 a valid final fail state */
2405 fail[ 0 ] = fail[ 1 ] = 1;
2406
2407 for ( charid = 0; charid < ucharcount ; charid++ ) {
2408 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2409 if ( newstate ) {
2410 q[ q_write ] = newstate;
2411 /* set to point at the root */
2412 fail[ q[ q_write++ ] ]=1;
2413 }
2414 }
2415 while ( q_read < q_write) {
2416 const U32 cur = q[ q_read++ % numstates ];
2417 base = trie->states[ cur ].trans.base;
2418
2419 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2420 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2421 if (ch_state) {
2422 U32 fail_state = cur;
2423 U32 fail_base;
2424 do {
2425 fail_state = fail[ fail_state ];
2426 fail_base = aho->states[ fail_state ].trans.base;
2427 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2428
2429 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2430 fail[ ch_state ] = fail_state;
2431 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2432 {
2433 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2434 }
2435 q[ q_write++ % numstates] = ch_state;
2436 }
2437 }
2438 }
2439 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2440 when we fail in state 1, this allows us to use the
2441 charclass scan to find a valid start char. This is based on the principle
2442 that theres a good chance the string being searched contains lots of stuff
2443 that cant be a start char.
2444 */
2445 fail[ 0 ] = fail[ 1 ] = 0;
2446 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2447 PerlIO_printf(Perl_debug_log,
2448 "%*sStclass Failtable (%"UVuf" states): 0",
2449 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2450 );
786e8c11
YO
2451 for( q_read=1; q_read<numstates; q_read++ ) {
2452 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2453 }
2454 PerlIO_printf(Perl_debug_log, "\n");
2455 });
2456 Safefree(q);
2457 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2458}
2459
786e8c11 2460
a3621e74 2461/*
5d1c421c
JH
2462 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2463 * These need to be revisited when a newer toolchain becomes available.
2464 */
2465#if defined(__sparc64__) && defined(__GNUC__)
2466# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2467# undef SPARC64_GCC_WORKAROUND
2468# define SPARC64_GCC_WORKAROUND 1
2469# endif
2470#endif
2471
07be1b83 2472#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2473 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2474 SV * const mysv=sv_newmortal(); \
2475 regnode *Next = regnext(scan); \
2476 regprop(RExC_rx, mysv, scan); \
7f69552c 2477 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2478 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2479 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2480 }});
07be1b83 2481
1de06328
YO
2482
2483
2484
2485
07be1b83
YO
2486#define JOIN_EXACT(scan,min,flags) \
2487 if (PL_regkind[OP(scan)] == EXACT) \
2488 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2489
be8e71aa 2490STATIC U32
07be1b83
YO
2491S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2492 /* Merge several consecutive EXACTish nodes into one. */
2493 regnode *n = regnext(scan);
2494 U32 stringok = 1;
2495 regnode *next = scan + NODE_SZ_STR(scan);
2496 U32 merged = 0;
2497 U32 stopnow = 0;
2498#ifdef DEBUGGING
2499 regnode *stop = scan;
72f13be8 2500 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2501#else
d47053eb
RGS
2502 PERL_UNUSED_ARG(depth);
2503#endif
7918f24d
NC
2504
2505 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2506#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2507 PERL_UNUSED_ARG(flags);
2508 PERL_UNUSED_ARG(val);
07be1b83 2509#endif
07be1b83
YO
2510 DEBUG_PEEP("join",scan,depth);
2511
2512 /* Skip NOTHING, merge EXACT*. */
2513 while (n &&
2514 ( PL_regkind[OP(n)] == NOTHING ||
2515 (stringok && (OP(n) == OP(scan))))
2516 && NEXT_OFF(n)
2517 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2518
2519 if (OP(n) == TAIL || n > next)
2520 stringok = 0;
2521 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2522 DEBUG_PEEP("skip:",n,depth);
2523 NEXT_OFF(scan) += NEXT_OFF(n);
2524 next = n + NODE_STEP_REGNODE;
2525#ifdef DEBUGGING
2526 if (stringok)
2527 stop = n;
2528#endif
2529 n = regnext(n);
2530 }
2531 else if (stringok) {
786e8c11 2532 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2533 regnode * const nnext = regnext(n);
2534
2535 DEBUG_PEEP("merg",n,depth);
2536
2537 merged++;
2538 if (oldl + STR_LEN(n) > U8_MAX)
2539 break;
2540 NEXT_OFF(scan) += NEXT_OFF(n);
2541 STR_LEN(scan) += STR_LEN(n);
2542 next = n + NODE_SZ_STR(n);
2543 /* Now we can overwrite *n : */
2544 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2545#ifdef DEBUGGING
2546 stop = next - 1;
2547#endif
2548 n = nnext;
2549 if (stopnow) break;
2550 }
2551
d47053eb
RGS
2552#ifdef EXPERIMENTAL_INPLACESCAN
2553 if (flags && !NEXT_OFF(n)) {
2554 DEBUG_PEEP("atch", val, depth);
2555 if (reg_off_by_arg[OP(n)]) {
2556 ARG_SET(n, val - n);
2557 }
2558 else {
2559 NEXT_OFF(n) = val - n;
2560 }
2561 stopnow = 1;
2562 }
07be1b83
YO
2563#endif
2564 }
ced7f090
KW
2565#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2566#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2567#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2568#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2569
2570 if (UTF
2f7f8cb1 2571 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2c2b7f86
KW
2572 && ( STR_LEN(scan) >= 6 ) )
2573 {
07be1b83
YO
2574 /*
2575 Two problematic code points in Unicode casefolding of EXACT nodes:
2576
2577 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2578 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2579
2580 which casefold to
2581
2582 Unicode UTF-8
2583
2584 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2585 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2586
2587 This means that in case-insensitive matching (or "loose matching",
2588 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2589 length of the above casefolded versions) can match a target string
2590 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2591 This would rather mess up the minimum length computation.
2592
2593 What we'll do is to look for the tail four bytes, and then peek
2594 at the preceding two bytes to see whether we need to decrease
2595 the minimum length by four (six minus two).
2596
2597 Thanks to the design of UTF-8, there cannot be false matches:
2598 A sequence of valid UTF-8 bytes cannot be a subsequence of
2599 another valid sequence of UTF-8 bytes.
2600
2601 */
2602 char * const s0 = STRING(scan), *s, *t;
2603 char * const s1 = s0 + STR_LEN(scan) - 1;
2604 char * const s2 = s1 - 4;
e294cc5d
JH
2605#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2606 const char t0[] = "\xaf\x49\xaf\x42";
2607#else
07be1b83 2608 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2609#endif
07be1b83
YO
2610 const char * const t1 = t0 + 3;
2611
2612 for (s = s0 + 2;
2613 s < s2 && (t = ninstr(s, s1, t0, t1));
2614 s = t + 4) {
e294cc5d
JH
2615#ifdef EBCDIC
2616 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2617 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2618#else
07be1b83
YO
2619 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2620 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2621#endif
07be1b83
YO
2622 *min -= 4;
2623 }
2624 }
2625
2626#ifdef DEBUGGING
2627 /* Allow dumping */
2628 n = scan + NODE_SZ_STR(scan);
2629 while (n <= stop) {
2630 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2631 OP(n) = OPTIMIZED;
2632 NEXT_OFF(n) = 0;
2633 }
2634 n++;
2635 }
2636#endif
2637 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2638 return stopnow;
2639}
2640
486ec47a 2641/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2642 Finds fixed substrings. */
2643
a0288114 2644/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2645 to the position after last scanned or to NULL. */
2646
40d049e4
YO
2647#define INIT_AND_WITHP \
2648 assert(!and_withp); \
2649 Newx(and_withp,1,struct regnode_charclass_class); \
2650 SAVEFREEPV(and_withp)
07be1b83 2651
b515a41d 2652/* this is a chain of data about sub patterns we are processing that
486ec47a 2653 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2654 we can simulate recursion without losing state. */
2655struct scan_frame;
2656typedef struct scan_frame {
2657 regnode *last; /* last node to process in this frame */
2658 regnode *next; /* next node to process when last is reached */
2659 struct scan_frame *prev; /*previous frame*/
2660 I32 stop; /* what stopparen do we use */
2661} scan_frame;
2662
304ee84b
YO
2663
2664#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2665
e1d1eefb
YO
2666#define CASE_SYNST_FNC(nAmE) \
2667case nAmE: \
2668 if (flags & SCF_DO_STCLASS_AND) { \
2669 for (value = 0; value < 256; value++) \
2670 if (!is_ ## nAmE ## _cp(value)) \
2671 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2672 } \
2673 else { \
2674 for (value = 0; value < 256; value++) \
2675 if (is_ ## nAmE ## _cp(value)) \
2676 ANYOF_BITMAP_SET(data->start_class, value); \
2677 } \
2678 break; \
2679case N ## nAmE: \
2680 if (flags & SCF_DO_STCLASS_AND) { \
2681 for (value = 0; value < 256; value++) \
2682 if (is_ ## nAmE ## _cp(value)) \
2683 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2684 } \
2685 else { \
2686 for (value = 0; value < 256; value++) \
2687 if (!is_ ## nAmE ## _cp(value)) \
2688 ANYOF_BITMAP_SET(data->start_class, value); \
2689 } \
2690 break
2691
2692
2693
76e3520e 2694STATIC I32
40d049e4 2695S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2696 I32 *minlenp, I32 *deltap,
40d049e4
YO
2697 regnode *last,
2698 scan_data_t *data,
2699 I32 stopparen,
2700 U8* recursed,
2701 struct regnode_charclass_class *and_withp,
2702 U32 flags, U32 depth)
c277df42
IZ
2703 /* scanp: Start here (read-write). */
2704 /* deltap: Write maxlen-minlen here. */
2705 /* last: Stop before this one. */
40d049e4
YO
2706 /* data: string data about the pattern */
2707 /* stopparen: treat close N as END */
2708 /* recursed: which subroutines have we recursed into */
2709 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2710{
97aff369 2711 dVAR;
c277df42
IZ
2712 I32 min = 0, pars = 0, code;
2713 regnode *scan = *scanp, *next;
2714 I32 delta = 0;
2715 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2716 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2717 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2718 scan_data_t data_fake;
a3621e74 2719 SV *re_trie_maxbuff = NULL;
786e8c11 2720 regnode *first_non_open = scan;
e2e6a0f1 2721 I32 stopmin = I32_MAX;
8aa23a47 2722 scan_frame *frame = NULL;
a3621e74 2723 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2724
7918f24d
NC
2725 PERL_ARGS_ASSERT_STUDY_CHUNK;
2726
13a24bad 2727#ifdef DEBUGGING
40d049e4 2728 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2729#endif
40d049e4 2730
786e8c11 2731 if ( depth == 0 ) {
40d049e4 2732 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2733 first_non_open=regnext(first_non_open);
2734 }
2735
b81d288d 2736
8aa23a47
YO
2737 fake_study_recurse:
2738 while ( scan && OP(scan) != END && scan < last ){
2739 /* Peephole optimizer: */
304ee84b 2740 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2741 DEBUG_PEEP("Peep",scan,depth);
2742 JOIN_EXACT(scan,&min,0);
2743
2744 /* Follow the next-chain of the current node and optimize
2745 away all the NOTHINGs from it. */
2746 if (OP(scan) != CURLYX) {
2747 const int max = (reg_off_by_arg[OP(scan)]
2748 ? I32_MAX
2749 /* I32 may be smaller than U16 on CRAYs! */
2750 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2751 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2752 int noff;
2753 regnode *n = scan;
2754
2755 /* Skip NOTHING and LONGJMP. */
2756 while ((n = regnext(n))
2757 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2758 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2759 && off + noff < max)
2760 off += noff;
2761 if (reg_off_by_arg[OP(scan)])
2762 ARG(scan) = off;
2763 else
2764 NEXT_OFF(scan) = off;
2765 }
a3621e74 2766
c277df42 2767
8aa23a47
YO
2768
2769 /* The principal pseudo-switch. Cannot be a switch, since we
2770 look into several different things. */
2771 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2772 || OP(scan) == IFTHEN) {
2773 next = regnext(scan);
2774 code = OP(scan);
2775 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2776
2777 if (OP(next) == code || code == IFTHEN) {
2778 /* NOTE - There is similar code to this block below for handling
2779 TRIE nodes on a re-study. If you change stuff here check there
2780 too. */
2781 I32 max1 = 0, min1 = I32_MAX, num = 0;
2782 struct regnode_charclass_class accum;
2783 regnode * const startbranch=scan;
2784
2785 if (flags & SCF_DO_SUBSTR)
304ee84b 2786 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 2787 if (flags & SCF_DO_STCLASS)
e755fd73 2788 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
2789
2790 while (OP(scan) == code) {
2791 I32 deltanext, minnext, f = 0, fake;
2792 struct regnode_charclass_class this_class;
2793
2794 num++;
2795 data_fake.flags = 0;
2796 if (data) {
2797 data_fake.whilem_c = data->whilem_c;
2798 data_fake.last_closep = data->last_closep;
2799 }
2800 else
2801 data_fake.last_closep = &fake;
58e23c8d
YO
2802
2803 data_fake.pos_delta = delta;
8aa23a47
YO
2804 next = regnext(scan);
2805 scan = NEXTOPER(scan);
2806 if (code != BRANCH)
c277df42 2807 scan = NEXTOPER(scan);
8aa23a47 2808 if (flags & SCF_DO_STCLASS) {
e755fd73 2809 cl_init(pRExC_state, &this_class);
8aa23a47
YO
2810 data_fake.start_class = &this_class;
2811 f = SCF_DO_STCLASS_AND;
58e23c8d 2812 }
8aa23a47
YO
2813 if (flags & SCF_WHILEM_VISITED_POS)
2814 f |= SCF_WHILEM_VISITED_POS;
2815
2816 /* we suppose the run is continuous, last=next...*/
2817 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2818 next, &data_fake,
2819 stopparen, recursed, NULL, f,depth+1);
2820 if (min1 > minnext)
2821 min1 = minnext;
2822 if (max1 < minnext + deltanext)
2823 max1 = minnext + deltanext;
2824 if (deltanext == I32_MAX)
2825 is_inf = is_inf_internal = 1;
2826 scan = next;
2827 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2828 pars++;
2829 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2830 if ( stopmin > minnext)
2831 stopmin = min + min1;
2832 flags &= ~SCF_DO_SUBSTR;
2833 if (data)
2834 data->flags |= SCF_SEEN_ACCEPT;
2835 }
2836 if (data) {
2837 if (data_fake.flags & SF_HAS_EVAL)
2838 data->flags |= SF_HAS_EVAL;
2839 data->whilem_c = data_fake.whilem_c;
3dab1dad 2840 }
8aa23a47 2841 if (flags & SCF_DO_STCLASS)
3fffb88a 2842 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
2843 }
2844 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2845 min1 = 0;
2846 if (flags & SCF_DO_SUBSTR) {
2847 data->pos_min += min1;
2848 data->pos_delta += max1 - min1;
2849 if (max1 != min1 || is_inf)
2850 data->longest = &(data->longest_float);
2851 }
2852 min += min1;
2853 delta += max1 - min1;
2854 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 2855 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
2856 if (min1) {
2857 cl_and(data->start_class, and_withp);
2858 flags &= ~SCF_DO_STCLASS;
653099ff 2859 }
8aa23a47
YO
2860 }
2861 else if (flags & SCF_DO_STCLASS_AND) {
2862 if (min1) {
2863 cl_and(data->start_class, &accum);
2864 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2865 }
8aa23a47
YO
2866 else {
2867 /* Switch to OR mode: cache the old value of
2868 * data->start_class */
2869 INIT_AND_WITHP;
2870 StructCopy(data->start_class, and_withp,
2871 struct regnode_charclass_class);
2872 flags &= ~SCF_DO_STCLASS_AND;
2873 StructCopy(&accum, data->start_class,
2874 struct regnode_charclass_class);
2875 flags |= SCF_DO_STCLASS_OR;
2876 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2877 }
8aa23a47 2878 }
a3621e74 2879
8aa23a47
YO
2880 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2881 /* demq.
a3621e74 2882
8aa23a47
YO
2883 Assuming this was/is a branch we are dealing with: 'scan' now
2884 points at the item that follows the branch sequence, whatever
2885 it is. We now start at the beginning of the sequence and look
2886 for subsequences of
a3621e74 2887
8aa23a47
YO
2888 BRANCH->EXACT=>x1
2889 BRANCH->EXACT=>x2
2890 tail
a3621e74 2891
8aa23a47 2892 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2893
486ec47a 2894 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2895 element into a trie and then add the subsequent branch exact
2896 strings to the trie.
a3621e74 2897
8aa23a47 2898 We have two cases
a3621e74 2899
3b753521 2900 1. patterns where the whole set of branches can be converted.
a3621e74 2901
8aa23a47 2902 2. patterns where only a subset can be converted.
a3621e74 2903
8aa23a47
YO
2904 In case 1 we can replace the whole set with a single regop
2905 for the trie. In case 2 we need to keep the start and end
3b753521 2906 branches so
a3621e74 2907
8aa23a47
YO
2908 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2909 becomes BRANCH TRIE; BRANCH X;
786e8c11 2910
8aa23a47
YO
2911 There is an additional case, that being where there is a
2912 common prefix, which gets split out into an EXACT like node
2913 preceding the TRIE node.
a3621e74 2914
8aa23a47
YO
2915 If x(1..n)==tail then we can do a simple trie, if not we make
2916 a "jump" trie, such that when we match the appropriate word
486ec47a 2917 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2918 a nested if into a case structure of sorts.
b515a41d 2919
8aa23a47
YO
2920 */
2921
2922 int made=0;
2923 if (!re_trie_maxbuff) {
2924 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2925 if (!SvIOK(re_trie_maxbuff))
2926 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2927 }
2928 if ( SvIV(re_trie_maxbuff)>=0 ) {
2929 regnode *cur;
2930 regnode *first = (regnode *)NULL;
2931 regnode *last = (regnode *)NULL;
2932 regnode *tail = scan;
2933 U8 optype = 0;
2934 U32 count=0;
a3621e74
YO
2935
2936#ifdef DEBUGGING
8aa23a47 2937 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2938#endif
8aa23a47
YO
2939 /* var tail is used because there may be a TAIL
2940 regop in the way. Ie, the exacts will point to the
2941 thing following the TAIL, but the last branch will
2942 point at the TAIL. So we advance tail. If we
2943 have nested (?:) we may have to move through several
2944 tails.
2945 */
2946
2947 while ( OP( tail ) == TAIL ) {
2948 /* this is the TAIL generated by (?:) */
2949 tail = regnext( tail );
2950 }
a3621e74 2951
8aa23a47
YO
2952
2953 DEBUG_OPTIMISE_r({
2954 regprop(RExC_rx, mysv, tail );
2955 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2956 (int)depth * 2 + 2, "",
2957 "Looking for TRIE'able sequences. Tail node is: ",
2958 SvPV_nolen_const( mysv )
2959 );
2960 });
2961
2962 /*
2963
2964 step through the branches, cur represents each
2965 branch, noper is the first thing to be matched
2966 as part of that branch and noper_next is the
2967 regnext() of that node. if noper is an EXACT
2968 and noper_next is the same as scan (our current
2969 position in the regex) then the EXACT branch is
2970 a possible optimization target. Once we have
486ec47a 2971 two or more consecutive such branches we can
8aa23a47
YO
2972 create a trie of the EXACT's contents and stich
2973 it in place. If the sequence represents all of
2974 the branches we eliminate the whole thing and
2975 replace it with a single TRIE. If it is a
2976 subsequence then we need to stitch it in. This
2977 means the first branch has to remain, and needs
2978 to be repointed at the item on the branch chain
2979 following the last branch optimized. This could
2980 be either a BRANCH, in which case the
2981 subsequence is internal, or it could be the
2982 item following the branch sequence in which
2983 case the subsequence is at the end.
2984
2985 */
2986
2987 /* dont use tail as the end marker for this traverse */
2988 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2989 regnode * const noper = NEXTOPER( cur );
b515a41d 2990#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2991 regnode * const noper_next = regnext( noper );
b515a41d
YO
2992#endif
2993
8aa23a47
YO
2994 DEBUG_OPTIMISE_r({
2995 regprop(RExC_rx, mysv, cur);
2996 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2997 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2998
2999 regprop(RExC_rx, mysv, noper);
3000 PerlIO_printf( Perl_debug_log, " -> %s",
3001 SvPV_nolen_const(mysv));
3002
3003 if ( noper_next ) {
3004 regprop(RExC_rx, mysv, noper_next );
3005 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3006 SvPV_nolen_const(mysv));
3007 }
3008 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3009 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3010 });
3011 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3012 : PL_regkind[ OP( noper ) ] == EXACT )
3013 || OP(noper) == NOTHING )
786e8c11 3014#ifdef NOJUMPTRIE
8aa23a47 3015 && noper_next == tail
786e8c11 3016#endif
8aa23a47
YO
3017 && count < U16_MAX)
3018 {
3019 count++;
3020 if ( !first || optype == NOTHING ) {
3021 if (!first) first = cur;
3022 optype = OP( noper );
3023 } else {
3024 last = cur;
3025 }
3026 } else {
a0a388a1 3027/*
0abd0d78
YO
3028 Currently we do not believe that the trie logic can
3029 handle case insensitive matching properly when the
3030 pattern is not unicode (thus forcing unicode semantics).
3031
3032 If/when this is fixed the following define can be swapped
3033 in below to fully enable trie logic.
3034
f0c16e54
KW
3035 XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3036 not /aa
3037
a0a388a1 3038#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
3039
3040*/
f0c16e54 3041#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
0abd0d78 3042
a0a388a1 3043 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
3044 make_trie( pRExC_state,
3045 startbranch, first, cur, tail, count,
3046 optype, depth+1 );
3047 }
3048 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 3049#ifdef NOJUMPTRIE
8aa23a47 3050 && noper_next == tail
786e8c11 3051#endif
8aa23a47
YO
3052 ){
3053 count = 1;
3054 first = cur;
3055 optype = OP( noper );
3056 } else {
3057 count = 0;
3058 first = NULL;
3059 optype = 0;
3060 }
3061 last = NULL;
3062 }
3063 }
3064 DEBUG_OPTIMISE_r({
3065 regprop(RExC_rx, mysv, cur);
3066 PerlIO_printf( Perl_debug_log,
3067 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3068 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3069
3070 });
a0a388a1
YO
3071
3072 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 3073 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 3074#ifdef TRIE_STUDY_OPT
8aa23a47
YO
3075 if ( ((made == MADE_EXACT_TRIE &&
3076 startbranch == first)
3077 || ( first_non_open == first )) &&
3078 depth==0 ) {
3079 flags |= SCF_TRIE_RESTUDY;
3080 if ( startbranch == first
3081 && scan == tail )
3082 {
3083 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3084 }
3085 }
3dab1dad 3086#endif
8aa23a47
YO
3087 }
3088 }
3089
3090 } /* do trie */
3091
653099ff 3092 }
8aa23a47
YO
3093 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3094 scan = NEXTOPER(NEXTOPER(scan));
3095 } else /* single branch is optimized. */
3096 scan = NEXTOPER(scan);
3097 continue;
3098 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3099 scan_frame *newframe = NULL;
3100 I32 paren;
3101 regnode *start;
3102 regnode *end;
3103
3104 if (OP(scan) != SUSPEND) {
3105 /* set the pointer */
3106 if (OP(scan) == GOSUB) {
3107 paren = ARG(scan);
3108 RExC_recurse[ARG2L(scan)] = scan;
3109 start = RExC_open_parens[paren-1];
3110 end = RExC_close_parens[paren-1];
3111 } else {
3112 paren = 0;
f8fc2ecf 3113 start = RExC_rxi->program + 1;
8aa23a47
YO
3114 end = RExC_opend;
3115 }
3116 if (!recursed) {
3117 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3118 SAVEFREEPV(recursed);
3119 }
3120 if (!PAREN_TEST(recursed,paren+1)) {
3121 PAREN_SET(recursed,paren+1);
3122 Newx(newframe,1,scan_frame);
3123 } else {
3124 if (flags & SCF_DO_SUBSTR) {
304ee84b 3125 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3126 data->longest = &(data->longest_float);
3127 }
3128 is_inf = is_inf_internal = 1;
3129 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 3130 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3131 flags &= ~SCF_DO_STCLASS;
3132 }
3133 } else {
3134 Newx(newframe,1,scan_frame);
3135 paren = stopparen;
3136 start = scan+2;
3137 end = regnext(scan);
3138 }
3139 if (newframe) {
3140 assert(start);
3141 assert(end);
3142 SAVEFREEPV(newframe);
3143 newframe->next = regnext(scan);
3144 newframe->last = last;
3145 newframe->stop = stopparen;
3146 newframe->prev = frame;
3147
3148 frame = newframe;
3149 scan = start;
3150 stopparen = paren;
3151 last = end;
3152
3153 continue;
3154 }
3155 }
3156 else if (OP(scan) == EXACT) {
3157 I32 l = STR_LEN(scan);
3158 UV uc;
3159 if (UTF) {
3160 const U8 * const s = (U8*)STRING(scan);
3161 l = utf8_length(s, s + l);
3162 uc = utf8_to_uvchr(s, NULL);
3163 } else {
3164 uc = *((U8*)STRING(scan));
3165 }
3166 min += l;
3167 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3168 /* The code below prefers earlier match for fixed
3169 offset, later match for variable offset. */
3170 if (data->last_end == -1) { /* Update the start info. */
3171 data->last_start_min = data->pos_min;
3172 data->last_start_max = is_inf
3173 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3174 }
8aa23a47
YO
3175 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3176 if (UTF)
3177 SvUTF8_on(data->last_found);
3178 {
3179 SV * const sv = data->last_found;
3180 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3181 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3182 if (mg && mg->mg_len >= 0)
3183 mg->mg_len += utf8_length((U8*)STRING(scan),
3184 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3185 }
8aa23a47
YO
3186 data->last_end = data->pos_min + l;
3187 data->pos_min += l; /* As in the first entry. */
3188 data->flags &= ~SF_BEFORE_EOL;
3189 }
3190 if (flags & SCF_DO_STCLASS_AND) {
3191 /* Check whether it is compatible with what we know already! */
3192 int compat = 1;
3193
54251c2e 3194
486ec47a 3195 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3196 * in the bitmp and either 1) its bit or its fold is set, or 2)
3197 * it's for a locale. Even if there isn't unicode semantics
3198 * here, at runtime there may be because of matching against a
3199 * utf8 string, so accept a possible false positive for
3200 * latin1-range folds */
8aa23a47
YO
3201 if (uc >= 0x100 ||
3202 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3203 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3204 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3205 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47 3206 )
d18bf9dc 3207 {
8aa23a47 3208 compat = 0;
d18bf9dc 3209 }
8aa23a47
YO
3210 ANYOF_CLASS_ZERO(data->start_class);
3211 ANYOF_BITMAP_ZERO(data->start_class);
3212 if (compat)
3213 ANYOF_BITMAP_SET(data->start_class, uc);
d18bf9dc
KW
3214 else if (uc >= 0x100) {
3215 int i;
3216
3217 /* Some Unicode code points fold to the Latin1 range; as
3218 * XXX temporary code, instead of figuring out if this is
3219 * one, just assume it is and set all the start class bits
3220 * that could be some such above 255 code point's fold
3221 * which will generate fals positives. As the code
3222 * elsewhere that does compute the fold settles down, it
3223 * can be extracted out and re-used here */
3224 for (i = 0; i < 256; i++){
3225 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3226 ANYOF_BITMAP_SET(data->start_class, i);
3227 }
3228 }
3229 }
8aa23a47
YO
3230 data->start_class->flags &= ~ANYOF_EOS;
3231 if (uc < 0x100)
3232 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3233 }
3234 else if (flags & SCF_DO_STCLASS_OR) {
3235 /* false positive possible if the class is case-folded */
3236 if (uc < 0x100)
3237 ANYOF_BITMAP_SET(data->start_class, uc);
3238 else
3239 data->start_class->flags |= ANYOF_UNICODE_ALL;
3240 data->start_class->flags &= ~ANYOF_EOS;
3241 cl_and(data->start_class, and_withp);
3242 }
3243 flags &= ~SCF_DO_STCLASS;
3244 }
3245 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3246 I32 l = STR_LEN(scan);
3247 UV uc = *((U8*)STRING(scan));
3248
3249 /* Search for fixed substrings supports EXACT only. */
3250 if (flags & SCF_DO_SUBSTR) {
3251 assert(data);
304ee84b 3252 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3253 }
3254 if (UTF) {
3255 const U8 * const s = (U8 *)STRING(scan);
3256 l = utf8_length(s, s + l);
3257 uc = utf8_to_uvchr(s, NULL);
3258 }
3259 min += l;
3260 if (flags & SCF_DO_SUBSTR)
3261 data->pos_min += l;
3262 if (flags & SCF_DO_STCLASS_AND) {
3263 /* Check whether it is compatible with what we know already! */
3264 int compat = 1;
8aa23a47 3265 if (uc >= 0x100 ||
54251c2e
KW
3266 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3267 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3268 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3269 {
8aa23a47 3270 compat = 0;
54251c2e 3271 }
8aa23a47
YO
3272 ANYOF_CLASS_ZERO(data->start_class);
3273 ANYOF_BITMAP_ZERO(data->start_class);
3274 if (compat) {
3275 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3276 data->start_class->flags &= ~ANYOF_EOS;
39065660 3277 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3278 if (OP(scan) == EXACTFL) {
af302e7f
KW
3279 /* XXX This set is probably no longer necessary, and
3280 * probably wrong as LOCALE now is on in the initial
3281 * state */
8aa23a47 3282 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3283 }
3284 else {
3285
54251c2e
KW
3286 /* Also set the other member of the fold pair. In case
3287 * that unicode semantics is called for at runtime, use
3288 * the full latin1 fold. (Can't do this for locale,
3289 * because not known until runtime */
3290 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3291 }
653099ff 3292 }
d18bf9dc
KW
3293 else if (uc >= 0x100) {
3294 int i;
3295 for (i = 0; i < 256; i++){
3296 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3297 ANYOF_BITMAP_SET(data->start_class, i);
3298 }
3299 }
3300 }
8aa23a47
YO
3301 }
3302 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3303 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3304 /* false positive possible if the class is case-folded.
3305 Assume that the locale settings are the same... */
970c8436 3306 if (uc < 0x100) {
1aa99e6b 3307 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3308 if (OP(scan) != EXACTFL) {
3309
3310 /* And set the other member of the fold pair, but
3311 * can't do that in locale because not known until
3312 * run-time */
3313 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3314 PL_fold_latin1[uc]);
970c8436
KW
3315 }
3316 }
653099ff
GS
3317 data->start_class->flags &= ~ANYOF_EOS;
3318 }
8aa23a47 3319 cl_and(data->start_class, and_withp);
653099ff 3320 }
8aa23a47
YO
3321 flags &= ~SCF_DO_STCLASS;
3322 }
e52fc539 3323 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3324 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3325 I32 f = flags, pos_before = 0;
3326 regnode * const oscan = scan;
3327 struct regnode_charclass_class this_class;
3328 struct regnode_charclass_class *oclass = NULL;
3329 I32 next_is_eval = 0;
3330
3331 switch (PL_regkind[OP(scan)]) {
3332 case WHILEM: /* End of (?:...)* . */
3333 scan = NEXTOPER(scan);
3334 goto finish;
3335 case PLUS:
3336 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3337 next = NEXTOPER(scan);
3338 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3339 mincount = 1;
3340 maxcount = REG_INFTY;
3341 next = regnext(scan);
3342 scan = NEXTOPER(scan);
3343 goto do_curly;
3344 }
3345 }
3346 if (flags & SCF_DO_SUBSTR)
3347 data->pos_min++;
3348 min++;
3349 /* Fall through. */
3350 case STAR:
3351 if (flags & SCF_DO_STCLASS) {
3352 mincount = 0;
3353 maxcount = REG_INFTY;
3354 next = regnext(scan);
3355 scan = NEXTOPER(scan);
3356 goto do_curly;
3357 }
3358 is_inf = is_inf_internal = 1;
3359 scan = regnext(scan);
c277df42 3360 if (flags & SCF_DO_SUBSTR) {
304ee84b 3361 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3362 data->longest = &(data->longest_float);
c277df42 3363 }
8aa23a47
YO
3364 goto optimize_curly_tail;
3365 case CURLY:
3366 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3367 && (scan->flags == stopparen))
3368 {
3369 mincount = 1;
3370 maxcount = 1;
3371 } else {
3372 mincount = ARG1(scan);
3373 maxcount = ARG2(scan);
653099ff 3374 }
8aa23a47
YO
3375 next = regnext(scan);
3376 if (OP(scan) == CURLYX) {
3377 I32 lp = (data ? *(data->last_closep) : 0);
3378 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3379 }
8aa23a47
YO
3380 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3381 next_is_eval = (OP(scan) == EVAL);
3382 do_curly:
3383 if (flags & SCF_DO_SUBSTR) {
304ee84b 3384 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3385 pos_before = data->pos_min;
b45f050a 3386 }
8aa23a47
YO
3387 if (data) {
3388 fl = data->flags;
3389 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3390 if (is_inf)
3391 data->flags |= SF_IS_INF;
3392 }
3393 if (flags & SCF_DO_STCLASS) {
e755fd73 3394 cl_init(pRExC_state, &this_class);
8aa23a47
YO
3395 oclass = data->start_class;
3396 data->start_class = &this_class;
3397 f |= SCF_DO_STCLASS_AND;
3398 f &= ~SCF_DO_STCLASS_OR;
3399 }
779bcb7d
NC
3400 /* Exclude from super-linear cache processing any {n,m}
3401 regops for which the combination of input pos and regex
3402 pos is not enough information to determine if a match
3403 will be possible.
3404
3405 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3406 regex pos at the \s*, the prospects for a match depend not
3407 only on the input position but also on how many (bar\s*)
3408 repeats into the {4,8} we are. */
3409 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3410 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3411
8aa23a47
YO
3412 /* This will finish on WHILEM, setting scan, or on NULL: */
3413 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3414 last, data, stopparen, recursed, NULL,
3415 (mincount == 0
3416 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3417
8aa23a47
YO
3418 if (flags & SCF_DO_STCLASS)
3419 data->start_class = oclass;
3420 if (mincount == 0 || minnext == 0) {
3421 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3422 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3423 }
3424 else if (flags & SCF_DO_STCLASS_AND) {
3425 /* Switch to OR mode: cache the old value of
3426 * data->start_class */
3427 INIT_AND_WITHP;
3428 StructCopy(data->start_class, and_withp,
3429 struct regnode_charclass_class);
3430 flags &= ~SCF_DO_STCLASS_AND;
3431 StructCopy(&this_class, data->start_class,
3432 struct regnode_charclass_class);
3433 flags |= SCF_DO_STCLASS_OR;
3434 data->start_class->flags |= ANYOF_EOS;
3435 }
3436 } else { /* Non-zero len */
3437 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3438 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3439 cl_and(data->start_class, and_withp);
3440 }
3441 else if (flags & SCF_DO_STCLASS_AND)
3442 cl_and(data->start_class, &this_class);
3443 flags &= ~SCF_DO_STCLASS;
3444 }
3445 if (!scan) /* It was not CURLYX, but CURLY. */
3446 scan = next;
3447 if ( /* ? quantifier ok, except for (?{ ... }) */
3448 (next_is_eval || !(mincount == 0 && maxcount == 1))
3449 && (minnext == 0) && (deltanext == 0)
3450 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3451 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3452 {
668c081a
NC
3453 ckWARNreg(RExC_parse,
3454 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3455 }
3456
3457 min += minnext * mincount;
3458 is_inf_internal |= ((maxcount == REG_INFTY
3459 && (minnext + deltanext) > 0)
3460 || deltanext == I32_MAX);
3461 is_inf |= is_inf_internal;
3462 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3463
3464 /* Try powerful optimization CURLYX => CURLYN. */
3465 if ( OP(oscan) == CURLYX && data
3466 && data->flags & SF_IN_PAR
3467 && !(data->flags & SF_HAS_EVAL)
3468 && !deltanext && minnext == 1 ) {
3469 /* Try to optimize to CURLYN. */
3470 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3471 regnode * const nxt1 = nxt;
497b47a8 3472#ifdef DEBUGGING
8aa23a47 3473 regnode *nxt2;
497b47a8 3474#endif
c277df42 3475
8aa23a47
YO
3476 /* Skip open. */
3477 nxt = regnext(nxt);
e52fc539 3478 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3479 && !(PL_regkind[OP(nxt)] == EXACT
3480 && STR_LEN(nxt) == 1))
3481 goto nogo;
497b47a8 3482#ifdef DEBUGGING
8aa23a47 3483 nxt2 = nxt;
497b47a8 3484#endif
8aa23a47
YO
3485 nxt = regnext(nxt);
3486 if (OP(nxt) != CLOSE)
3487 goto nogo;
3488 if (RExC_open_parens) {
3489 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3490 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3491 }
3492 /* Now we know that nxt2 is the only contents: */
3493 oscan->flags = (U8)ARG(nxt);
3494 OP(oscan) = CURLYN;
3495 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3496
c277df42 3497#ifdef DEBUGGING
8aa23a47 3498 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3499 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3500 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3501 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3502 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3503 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3504#endif
8aa23a47
YO
3505 }
3506 nogo:
3507
3508 /* Try optimization CURLYX => CURLYM. */
3509 if ( OP(oscan) == CURLYX && data
3510 && !(data->flags & SF_HAS_PAR)
3511 && !(data->flags & SF_HAS_EVAL)
3512 && !deltanext /* atom is fixed width */
3513 && minnext != 0 /* CURLYM can't handle zero width */
3514 ) {
3515 /* XXXX How to optimize if data == 0? */
3516 /* Optimize to a simpler form. */
3517 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3518 regnode *nxt2;
3519
3520 OP(oscan) = CURLYM;
3521 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3522 && (OP(nxt2) != WHILEM))
3523 nxt = nxt2;
3524 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3525 /* Need to optimize away parenths. */
b3c0965f 3526 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3527 /* Set the parenth number. */
3528 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3529
8aa23a47
YO
3530 oscan->flags = (U8)ARG(nxt);
3531 if (RExC_open_parens) {
3532 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3533 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3534 }
8aa23a47
YO
3535 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3536 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3537
c277df42 3538#ifdef DEBUGGING
8aa23a47
YO
3539 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3540 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3541 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3542 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3543#endif
c277df42 3544#if 0
8aa23a47
YO
3545 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3546 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3547 if (nnxt == nxt) {
3548 if (reg_off_by_arg[OP(nxt1)])
3549 ARG_SET(nxt1, nxt2 - nxt1);
3550 else if (nxt2 - nxt1 < U16_MAX)
3551 NEXT_OFF(nxt1) = nxt2 - nxt1;
3552 else
3553 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3554 }
8aa23a47 3555 nxt1 = nnxt;
c277df42 3556 }
5d1c421c 3557#endif
8aa23a47
YO
3558 /* Optimize again: */
3559 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3560 NULL, stopparen, recursed, NULL, 0,depth+1);
3561 }
3562 else
3563 oscan->flags = 0;
3564 }
3565 else if ((OP(oscan) == CURLYX)
3566 && (flags & SCF_WHILEM_VISITED_POS)
3567 /* See the comment on a similar expression above.
3b753521 3568 However, this time it's not a subexpression
8aa23a47
YO
3569 we care about, but the expression itself. */
3570 && (maxcount == REG_INFTY)
3571 && data && ++data->whilem_c < 16) {
3572 /* This stays as CURLYX, we can put the count/of pair. */
3573 /* Find WHILEM (as in regexec.c) */
3574 regnode *nxt = oscan + NEXT_OFF(oscan);
3575
3576 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3577 nxt += ARG(nxt);
3578 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3579 | (RExC_whilem_seen << 4)); /* On WHILEM */
3580 }
3581 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3582 pars++;
3583 if (flags & SCF_DO_SUBSTR) {
3584 SV *last_str = NULL;
3585 int counted = mincount != 0;
a0ed51b3 3586
8aa23a47
YO
3587 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3588#if defined(SPARC64_GCC_WORKAROUND)
3589 I32 b = 0;
3590 STRLEN l = 0;
3591 const char *s = NULL;
3592 I32 old = 0;
b515a41d 3593
8aa23a47
YO
3594 if (pos_before >= data->last_start_min)
3595 b = pos_before;
3596 else
3597 b = data->last_start_min;
b515a41d 3598
8aa23a47
YO
3599 l = 0;
3600 s = SvPV_const(data->last_found, l);
3601 old = b - data->last_start_min;
3602
3603#else
3604 I32 b = pos_before >= data->last_start_min
3605 ? pos_before : data->last_start_min;
3606 STRLEN l;
3607 const char * const s = SvPV_const(data->last_found, l);
3608 I32 old = b - data->last_start_min;
3609#endif
3610
3611 if (UTF)
3612 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3613 l -= old;
3614 /* Get the added string: */
740cce10 3615 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3616 if (deltanext == 0 && pos_before == b) {
3617 /* What was added is a constant string */
3618 if (mincount > 1) {
3619 SvGROW(last_str, (mincount * l) + 1);
3620 repeatcpy(SvPVX(last_str) + l,
3621 SvPVX_const(last_str), l, mincount - 1);
3622 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3623 /* Add additional parts. */
3624 SvCUR_set(data->last_found,
3625 SvCUR(data->last_found) - l);
3626 sv_catsv(data->last_found, last_str);
3627 {
3628 SV * sv = data->last_found;
3629 MAGIC *mg =
3630 SvUTF8(sv) && SvMAGICAL(sv) ?
3631 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3632 if (mg && mg->mg_len >= 0)
bd94e887 3633 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3634 }
8aa23a47 3635 data->last_end += l * (mincount - 1);
b515a41d 3636 }
8aa23a47
YO
3637 } else {
3638 /* start offset must point into the last copy */
3639 data->last_start_min += minnext * (mincount - 1);
3640 data->last_start_max += is_inf ? I32_MAX
3641 : (maxcount - 1) * (minnext + data->pos_delta);
3642 }
c277df42 3643 }
8aa23a47
YO
3644 /* It is counted once already... */
3645 data->pos_min += minnext * (mincount - counted);
3646 data->pos_delta += - counted * deltanext +
3647 (minnext + deltanext) * maxcount - minnext * mincount;
3648 if (mincount != maxcount) {
3649 /* Cannot extend fixed substrings found inside
3650 the group. */
304ee84b 3651 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3652 if (mincount && last_str) {
3653 SV * const sv = data->last_found;
3654 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3655 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3656
3657 if (mg)
3658 mg->mg_len = -1;
3659 sv_setsv(sv, last_str);
3660 data->last_end = data->pos_min;
3661 data->last_start_min =
3662 data->pos_min - CHR_SVLEN(last_str);
3663 data->last_start_max = is_inf
3664 ? I32_MAX
3665 : data->pos_min + data->pos_delta
3666 - CHR_SVLEN(last_str);
3667 }
3668 data->longest = &(data->longest_float);
3669 }
3670 SvREFCNT_dec(last_str);
c277df42 3671 }
8aa23a47
YO
3672 if (data && (fl & SF_HAS_EVAL))
3673 data->flags |= SF_HAS_EVAL;
3674 optimize_curly_tail:
3675 if (OP(oscan) != CURLYX) {
3676 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING