This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: ANYOF node handle range to UV_MAX
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
e40e74fe
KW
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
81714fb9 140 HV *paren_names; /* Paren names */
1f1031fe 141
40d049e4
YO
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
b57e4118 144 I32 in_lookbehind;
830247a4
IZ
145#if ADD_TO_REGEXEC
146 char *starttry; /* -Dr: where regtry was called. */
147#define RExC_starttry (pRExC_state->starttry)
148#endif
3dab1dad 149#ifdef DEBUGGING
be8e71aa 150 const char *lastparse;
3dab1dad 151 I32 lastnum;
1f1031fe 152 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
153#define RExC_lastparse (pRExC_state->lastparse)
154#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 155#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 156#endif
830247a4
IZ
157} RExC_state_t;
158
e2509266 159#define RExC_flags (pRExC_state->flags)
830247a4 160#define RExC_precomp (pRExC_state->precomp)
288b8c02 161#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 162#define RExC_rx (pRExC_state->rx)
f8fc2ecf 163#define RExC_rxi (pRExC_state->rxi)
fac92740 164#define RExC_start (pRExC_state->start)
830247a4
IZ
165#define RExC_end (pRExC_state->end)
166#define RExC_parse (pRExC_state->parse)
167#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
168#ifdef RE_TRACK_PATTERN_OFFSETS
169#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
170#endif
830247a4 171#define RExC_emit (pRExC_state->emit)
fac92740 172#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 173#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
174#define RExC_naughty (pRExC_state->naughty)
175#define RExC_sawback (pRExC_state->sawback)
176#define RExC_seen (pRExC_state->seen)
177#define RExC_size (pRExC_state->size)
178#define RExC_npar (pRExC_state->npar)
e2e6a0f1 179#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
180#define RExC_extralen (pRExC_state->extralen)
181#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
182#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 183#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 184#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 185#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
186#define RExC_open_parens (pRExC_state->open_parens)
187#define RExC_close_parens (pRExC_state->close_parens)
188#define RExC_opend (pRExC_state->opend)
81714fb9 189#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
190#define RExC_recurse (pRExC_state->recurse)
191#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 192#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
830247a4 193
cde0cee5 194
a687059c
LW
195#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
196#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
197 ((*s) == '{' && regcurly(s)))
a687059c 198
35c8bce7
LW
199#ifdef SPSTART
200#undef SPSTART /* dratted cpp namespace... */
201#endif
a687059c
LW
202/*
203 * Flags to be passed up and down.
204 */
a687059c 205#define WORST 0 /* Worst case. */
a3b492c3 206#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
207
208/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
d7b56a3c 209 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
fda99bee 210#define SIMPLE 0x02
a3b492c3
YO
211#define SPSTART 0x04 /* Starts with * or +. */
212#define TRYAGAIN 0x08 /* Weeded out a declaration. */
213#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 214
3dab1dad
YO
215#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
216
07be1b83
YO
217/* whether trie related optimizations are enabled */
218#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
219#define TRIE_STUDY_OPT
786e8c11 220#define FULL_TRIE_STUDY
07be1b83
YO
221#define TRIE_STCLASS
222#endif
1de06328
YO
223
224
40d049e4
YO
225
226#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
227#define PBITVAL(paren) (1 << ((paren) & 7))
228#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
229#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
230#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
231
bbd61b5f
KW
232/* If not already in utf8, do a longjmp back to the beginning */
233#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
234#define REQUIRE_UTF8 STMT_START { \
235 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
236 } STMT_END
40d049e4 237
1de06328
YO
238/* About scan_data_t.
239
240 During optimisation we recurse through the regexp program performing
241 various inplace (keyhole style) optimisations. In addition study_chunk
242 and scan_commit populate this data structure with information about
243 what strings MUST appear in the pattern. We look for the longest
3b753521 244 string that must appear at a fixed location, and we look for the
1de06328
YO
245 longest string that may appear at a floating location. So for instance
246 in the pattern:
247
248 /FOO[xX]A.*B[xX]BAR/
249
250 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
251 strings (because they follow a .* construct). study_chunk will identify
252 both FOO and BAR as being the longest fixed and floating strings respectively.
253
254 The strings can be composites, for instance
255
256 /(f)(o)(o)/
257
258 will result in a composite fixed substring 'foo'.
259
260 For each string some basic information is maintained:
261
262 - offset or min_offset
263 This is the position the string must appear at, or not before.
264 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
265 characters must match before the string we are searching for.
266 Likewise when combined with minlenp and the length of the string it
1de06328
YO
267 tells us how many characters must appear after the string we have
268 found.
269
270 - max_offset
271 Only used for floating strings. This is the rightmost point that
3b753521 272 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
273 string can occur infinitely far to the right.
274
275 - minlenp
276 A pointer to the minimum length of the pattern that the string
277 was found inside. This is important as in the case of positive
278 lookahead or positive lookbehind we can have multiple patterns
279 involved. Consider
280
281 /(?=FOO).*F/
282
283 The minimum length of the pattern overall is 3, the minimum length
284 of the lookahead part is 3, but the minimum length of the part that
285 will actually match is 1. So 'FOO's minimum length is 3, but the
286 minimum length for the F is 1. This is important as the minimum length
287 is used to determine offsets in front of and behind the string being
288 looked for. Since strings can be composites this is the length of the
486ec47a 289 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
290 the length is calculated by study_chunk, so that the minimum lengths
291 are not known until the full pattern has been compiled, thus the
292 pointer to the value.
293
294 - lookbehind
295
296 In the case of lookbehind the string being searched for can be
297 offset past the start point of the final matching string.
298 If this value was just blithely removed from the min_offset it would
299 invalidate some of the calculations for how many chars must match
300 before or after (as they are derived from min_offset and minlen and
301 the length of the string being searched for).
302 When the final pattern is compiled and the data is moved from the
303 scan_data_t structure into the regexp structure the information
304 about lookbehind is factored in, with the information that would
305 have been lost precalculated in the end_shift field for the
306 associated string.
307
308 The fields pos_min and pos_delta are used to store the minimum offset
309 and the delta to the maximum offset at the current point in the pattern.
310
311*/
2c2d71f5
JH
312
313typedef struct scan_data_t {
1de06328
YO
314 /*I32 len_min; unused */
315 /*I32 len_delta; unused */
2c2d71f5
JH
316 I32 pos_min;
317 I32 pos_delta;
318 SV *last_found;
1de06328 319 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
320 I32 last_start_min;
321 I32 last_start_max;
1de06328
YO
322 SV **longest; /* Either &l_fixed, or &l_float. */
323 SV *longest_fixed; /* longest fixed string found in pattern */
324 I32 offset_fixed; /* offset where it starts */
486ec47a 325 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
326 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
327 SV *longest_float; /* longest floating string found in pattern */
328 I32 offset_float_min; /* earliest point in string it can appear */
329 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 330 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 331 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
332 I32 flags;
333 I32 whilem_c;
cb434fcc 334 I32 *last_closep;
653099ff 335 struct regnode_charclass_class *start_class;
2c2d71f5
JH
336} scan_data_t;
337
a687059c 338/*
e50aee73 339 * Forward declarations for pregcomp()'s friends.
a687059c 340 */
a0d0e21e 341
27da23d5 342static const scan_data_t zero_scan_data =
1de06328 343 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
344
345#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
346#define SF_BEFORE_SEOL 0x0001
347#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
348#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
349#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
350
09b7f37c
CB
351#ifdef NO_UNARY_PLUS
352# define SF_FIX_SHIFT_EOL (0+2)
353# define SF_FL_SHIFT_EOL (0+4)
354#else
355# define SF_FIX_SHIFT_EOL (+2)
356# define SF_FL_SHIFT_EOL (+4)
357#endif
c277df42
IZ
358
359#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
360#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
361
362#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
363#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
364#define SF_IS_INF 0x0040
365#define SF_HAS_PAR 0x0080
366#define SF_IN_PAR 0x0100
367#define SF_HAS_EVAL 0x0200
368#define SCF_DO_SUBSTR 0x0400
653099ff
GS
369#define SCF_DO_STCLASS_AND 0x0800
370#define SCF_DO_STCLASS_OR 0x1000
371#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 372#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 373
786e8c11 374#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 375#define SCF_SEEN_ACCEPT 0x8000
07be1b83 376
43fead97 377#define UTF cBOOL(RExC_utf8)
a62b1201
KW
378#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
379#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
380#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
381#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
382#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1
KW
383#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
384#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 385
43fead97 386#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 387
ffc61ed2 388#define OOB_UNICODE 12345678
93733859 389#define OOB_NAMEDCLASS -1
b8c5462f 390
a0ed51b3
LW
391#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
392#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
393
8615cb43 394
b45f050a
JF
395/* length of regex to show in messages that don't mark a position within */
396#define RegexLengthToShowInErrorMessages 127
397
398/*
399 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
400 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
401 * op/pragma/warn/regcomp.
402 */
7253e4e3
RK
403#define MARKER1 "<-- HERE" /* marker as it appears in the description */
404#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 405
7253e4e3 406#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
407
408/*
409 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
410 * arg. Show regex, up to a maximum length. If it's too long, chop and add
411 * "...".
412 */
58e23c8d 413#define _FAIL(code) STMT_START { \
bfed75c6 414 const char *ellipses = ""; \
ccb2c380
MP
415 IV len = RExC_end - RExC_precomp; \
416 \
417 if (!SIZE_ONLY) \
288b8c02 418 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
419 if (len > RegexLengthToShowInErrorMessages) { \
420 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
421 len = RegexLengthToShowInErrorMessages - 10; \
422 ellipses = "..."; \
423 } \
58e23c8d 424 code; \
ccb2c380 425} STMT_END
8615cb43 426
58e23c8d
YO
427#define FAIL(msg) _FAIL( \
428 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
429 msg, (int)len, RExC_precomp, ellipses))
430
431#define FAIL2(msg,arg) _FAIL( \
432 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
433 arg, (int)len, RExC_precomp, ellipses))
434
b45f050a 435/*
b45f050a
JF
436 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
437 */
ccb2c380 438#define Simple_vFAIL(m) STMT_START { \
a28509cc 439 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
440 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
441 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
442} STMT_END
b45f050a
JF
443
444/*
445 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
446 */
ccb2c380
MP
447#define vFAIL(m) STMT_START { \
448 if (!SIZE_ONLY) \
288b8c02 449 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
450 Simple_vFAIL(m); \
451} STMT_END
b45f050a
JF
452
453/*
454 * Like Simple_vFAIL(), but accepts two arguments.
455 */
ccb2c380 456#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 457 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
458 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
459 (int)offset, RExC_precomp, RExC_precomp + offset); \
460} STMT_END
b45f050a
JF
461
462/*
463 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
464 */
ccb2c380
MP
465#define vFAIL2(m,a1) STMT_START { \
466 if (!SIZE_ONLY) \
288b8c02 467 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
468 Simple_vFAIL2(m, a1); \
469} STMT_END
b45f050a
JF
470
471
472/*
473 * Like Simple_vFAIL(), but accepts three arguments.
474 */
ccb2c380 475#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 476 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
477 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
478 (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
b45f050a
JF
480
481/*
482 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
483 */
ccb2c380
MP
484#define vFAIL3(m,a1,a2) STMT_START { \
485 if (!SIZE_ONLY) \
288b8c02 486 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
487 Simple_vFAIL3(m, a1, a2); \
488} STMT_END
b45f050a
JF
489
490/*
491 * Like Simple_vFAIL(), but accepts four arguments.
492 */
ccb2c380 493#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 494 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
495 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
496 (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
b45f050a 498
668c081a 499#define ckWARNreg(loc,m) STMT_START { \
a28509cc 500 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
501 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
503} STMT_END
504
668c081a 505#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 506 const IV offset = loc - RExC_precomp; \
d1d15184 507 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
508 m REPORT_LOCATION, \
509 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
510} STMT_END
511
668c081a 512#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 513 const IV offset = loc - RExC_precomp; \
668c081a 514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
515 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
516} STMT_END
517
518#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 519 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a
NC
524#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
ccb2c380 530#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
535
668c081a
NC
536#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
537 const IV offset = loc - RExC_precomp; \
538 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
539 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
540} STMT_END
541
ccb2c380 542#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 543 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
544 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
545 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
546} STMT_END
9d1d55b5 547
8615cb43 548
cd439c50 549/* Allow for side effects in s */
ccb2c380
MP
550#define REGC(c,s) STMT_START { \
551 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
552} STMT_END
cd439c50 553
fac92740
MJD
554/* Macros for recording node offsets. 20001227 mjd@plover.com
555 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
556 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
557 * Element 0 holds the number n.
07be1b83 558 * Position is 1 indexed.
fac92740 559 */
7122b237
YO
560#ifndef RE_TRACK_PATTERN_OFFSETS
561#define Set_Node_Offset_To_R(node,byte)
562#define Set_Node_Offset(node,byte)
563#define Set_Cur_Node_Offset
564#define Set_Node_Length_To_R(node,len)
565#define Set_Node_Length(node,len)
566#define Set_Node_Cur_Length(node)
567#define Node_Offset(n)
568#define Node_Length(n)
569#define Set_Node_Offset_Length(node,offset,len)
570#define ProgLen(ri) ri->u.proglen
571#define SetProgLen(ri,x) ri->u.proglen = x
572#else
573#define ProgLen(ri) ri->u.offsets[0]
574#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
575#define Set_Node_Offset_To_R(node,byte) STMT_START { \
576 if (! SIZE_ONLY) { \
577 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 578 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 579 if((node) < 0) { \
551405c4 580 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
581 } else { \
582 RExC_offsets[2*(node)-1] = (byte); \
583 } \
584 } \
585} STMT_END
586
587#define Set_Node_Offset(node,byte) \
588 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
589#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
590
591#define Set_Node_Length_To_R(node,len) STMT_START { \
592 if (! SIZE_ONLY) { \
593 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 594 __LINE__, (int)(node), (int)(len))); \
ccb2c380 595 if((node) < 0) { \
551405c4 596 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
597 } else { \
598 RExC_offsets[2*(node)] = (len); \
599 } \
600 } \
601} STMT_END
602
603#define Set_Node_Length(node,len) \
604 Set_Node_Length_To_R((node)-RExC_emit_start, len)
605#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
606#define Set_Node_Cur_Length(node) \
607 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
608
609/* Get offsets and lengths */
610#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
611#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
612
07be1b83
YO
613#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
614 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
615 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
616} STMT_END
7122b237 617#endif
07be1b83
YO
618
619#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
620#define EXPERIMENTAL_INPLACESCAN
f427392e 621#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 622
304ee84b
YO
623#define DEBUG_STUDYDATA(str,data,depth) \
624DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 625 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
626 "%*s" str "Pos:%"IVdf"/%"IVdf \
627 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
628 (int)(depth)*2, "", \
629 (IV)((data)->pos_min), \
630 (IV)((data)->pos_delta), \
304ee84b 631 (UV)((data)->flags), \
1de06328 632 (IV)((data)->whilem_c), \
304ee84b
YO
633 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
634 is_inf ? "INF " : "" \
1de06328
YO
635 ); \
636 if ((data)->last_found) \
637 PerlIO_printf(Perl_debug_log, \
638 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
639 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
640 SvPVX_const((data)->last_found), \
641 (IV)((data)->last_end), \
642 (IV)((data)->last_start_min), \
643 (IV)((data)->last_start_max), \
644 ((data)->longest && \
645 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
646 SvPVX_const((data)->longest_fixed), \
647 (IV)((data)->offset_fixed), \
648 ((data)->longest && \
649 (data)->longest==&((data)->longest_float)) ? "*" : "", \
650 SvPVX_const((data)->longest_float), \
651 (IV)((data)->offset_float_min), \
652 (IV)((data)->offset_float_max) \
653 ); \
654 PerlIO_printf(Perl_debug_log,"\n"); \
655});
656
acfe0abc 657static void clear_re(pTHX_ void *r);
4327152a 658
653099ff 659/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 660 Update the longest found anchored substring and the longest found
653099ff
GS
661 floating substrings if needed. */
662
4327152a 663STATIC void
304ee84b 664S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 665{
e1ec3a88
AL
666 const STRLEN l = CHR_SVLEN(data->last_found);
667 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 668 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 669
7918f24d
NC
670 PERL_ARGS_ASSERT_SCAN_COMMIT;
671
c277df42 672 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 673 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
674 if (*data->longest == data->longest_fixed) {
675 data->offset_fixed = l ? data->last_start_min : data->pos_min;
676 if (data->flags & SF_BEFORE_EOL)
b81d288d 677 data->flags
c277df42
IZ
678 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
679 else
680 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
681 data->minlen_fixed=minlenp;
682 data->lookbehind_fixed=0;
a0ed51b3 683 }
304ee84b 684 else { /* *data->longest == data->longest_float */
c277df42 685 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
686 data->offset_float_max = (l
687 ? data->last_start_max
c277df42 688 : data->pos_min + data->pos_delta);
304ee84b 689 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 690 data->offset_float_max = I32_MAX;
c277df42 691 if (data->flags & SF_BEFORE_EOL)
b81d288d 692 data->flags
c277df42
IZ
693 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
694 else
695 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
696 data->minlen_float=minlenp;
697 data->lookbehind_float=0;
c277df42
IZ
698 }
699 }
700 SvCUR_set(data->last_found, 0);
0eda9292 701 {
a28509cc 702 SV * const sv = data->last_found;
097eb12c
AL
703 if (SvUTF8(sv) && SvMAGICAL(sv)) {
704 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
705 if (mg)
706 mg->mg_len = 0;
707 }
0eda9292 708 }
c277df42
IZ
709 data->last_end = -1;
710 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 711 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
712}
713
653099ff
GS
714/* Can match anything (initialization) */
715STATIC void
097eb12c 716S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 717{
7918f24d
NC
718 PERL_ARGS_ASSERT_CL_ANYTHING;
719
653099ff 720 ANYOF_CLASS_ZERO(cl);
f8bef550 721 ANYOF_BITMAP_SETALL(cl);
11454c59 722 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
653099ff
GS
723 if (LOC)
724 cl->flags |= ANYOF_LOCALE;
725}
726
727/* Can match anything (initialization) */
728STATIC int
5f66b61c 729S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
730{
731 int value;
732
7918f24d
NC
733 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
734
aaa51d5e 735 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
736 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
737 return 1;
1aa99e6b
IH
738 if (!(cl->flags & ANYOF_UNICODE_ALL))
739 return 0;
10edeb5d 740 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 741 return 0;
653099ff
GS
742 return 1;
743}
744
745/* Can match anything (initialization) */
746STATIC void
097eb12c 747S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 748{
7918f24d
NC
749 PERL_ARGS_ASSERT_CL_INIT;
750
8ecf7187 751 Zero(cl, 1, struct regnode_charclass_class);
653099ff 752 cl->type = ANYOF;
830247a4 753 cl_anything(pRExC_state, cl);
653099ff
GS
754}
755
756STATIC void
097eb12c 757S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 758{
7918f24d
NC
759 PERL_ARGS_ASSERT_CL_INIT_ZERO;
760
8ecf7187 761 Zero(cl, 1, struct regnode_charclass_class);
653099ff 762 cl->type = ANYOF;
830247a4 763 cl_anything(pRExC_state, cl);
653099ff
GS
764 if (LOC)
765 cl->flags |= ANYOF_LOCALE;
766}
767
768/* 'And' a given class with another one. Can create false positives */
769/* We assume that cl is not inverted */
770STATIC void
5f66b61c 771S_cl_and(struct regnode_charclass_class *cl,
a28509cc 772 const struct regnode_charclass_class *and_with)
653099ff 773{
7918f24d 774 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
775
776 assert(and_with->type == ANYOF);
1e6ade67
KW
777
778 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
779 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 780 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
781 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
782 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
783 int i;
784
785 if (and_with->flags & ANYOF_INVERT)
786 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
787 cl->bitmap[i] &= ~and_with->bitmap[i];
788 else
789 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
790 cl->bitmap[i] &= and_with->bitmap[i];
791 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
792 if (!(and_with->flags & ANYOF_EOS))
793 cl->flags &= ~ANYOF_EOS;
1aa99e6b 794
39065660
KW
795 if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
796 cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
11454c59
KW
797 if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
798 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
8951c461 799
4713bfe1
KW
800 if (cl->flags & ANYOF_UNICODE_ALL
801 && and_with->flags & ANYOF_NONBITMAP
802 && !(and_with->flags & ANYOF_INVERT))
803 {
804 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
805 cl->flags &= ~ANYOF_UNICODE_ALL;
806 }
ef87b810
KW
807 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
808 only the one(s)
809 actually set */
1aa99e6b
IH
810 ARG_SET(cl, ARG(and_with));
811 }
14ebb1a2
JH
812 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
813 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 814 cl->flags &= ~ANYOF_UNICODE_ALL;
3ff7ceb3 815 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
14ebb1a2 816 !(and_with->flags & ANYOF_INVERT))
3ff7ceb3 817 cl->flags &= ~ANYOF_NONBITMAP;
653099ff
GS
818}
819
820/* 'OR' a given class with another one. Can create false positives */
821/* We assume that cl is not inverted */
822STATIC void
097eb12c 823S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 824{
7918f24d
NC
825 PERL_ARGS_ASSERT_CL_OR;
826
653099ff
GS
827 if (or_with->flags & ANYOF_INVERT) {
828 /* We do not use
829 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
830 * <= (B1 | !B2) | (CL1 | !CL2)
831 * which is wasteful if CL2 is small, but we ignore CL2:
832 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
833 * XXXX Can we handle case-fold? Unclear:
834 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
835 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
836 */
837 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
838 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
839 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
840 int i;
841
842 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
843 cl->bitmap[i] |= ~or_with->bitmap[i];
844 } /* XXXX: logic is complicated otherwise */
845 else {
830247a4 846 cl_anything(pRExC_state, cl);
653099ff
GS
847 }
848 } else {
849 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
850 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
851 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
852 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
853 int i;
854
855 /* OR char bitmap and class bitmap separately */
856 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
857 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 858 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
859 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
860 cl->classflags[i] |= or_with->classflags[i];
861 cl->flags |= ANYOF_CLASS;
862 }
863 }
864 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 865 cl_anything(pRExC_state, cl);
653099ff
GS
866 }
867 }
868 if (or_with->flags & ANYOF_EOS)
869 cl->flags |= ANYOF_EOS;
11454c59
KW
870 if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
871 cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
1aa99e6b 872
39065660
KW
873 if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
874 cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
8951c461 875
9826f543
KW
876 /* If both nodes match something outside the bitmap, but what they match
877 * outside is not the same pointer, and hence not easily compared, give up
878 * and allow the start class to match everything outside the bitmap */
3ff7ceb3 879 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
1aa99e6b
IH
880 ARG(cl) != ARG(or_with)) {
881 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 882 }
9826f543 883
1aa99e6b
IH
884 if (or_with->flags & ANYOF_UNICODE_ALL) {
885 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 886 }
653099ff
GS
887}
888
a3621e74
YO
889#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
890#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
891#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
892#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
893
3dab1dad
YO
894
895#ifdef DEBUGGING
07be1b83 896/*
2b8b4781
NC
897 dump_trie(trie,widecharmap,revcharmap)
898 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
899 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
900
901 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
902 The _interim_ variants are used for debugging the interim
903 tables that are used to generate the final compressed
904 representation which is what dump_trie expects.
905
486ec47a 906 Part of the reason for their existence is to provide a form
3dab1dad 907 of documentation as to how the different representations function.
07be1b83
YO
908
909*/
3dab1dad
YO
910
911/*
3dab1dad
YO
912 Dumps the final compressed table form of the trie to Perl_debug_log.
913 Used for debugging make_trie().
914*/
b9a59e08 915
3dab1dad 916STATIC void
2b8b4781
NC
917S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
918 AV *revcharmap, U32 depth)
3dab1dad
YO
919{
920 U32 state;
ab3bbdeb 921 SV *sv=sv_newmortal();
55eed653 922 int colwidth= widecharmap ? 6 : 4;
2e64971a 923 U16 word;
3dab1dad
YO
924 GET_RE_DEBUG_FLAGS_DECL;
925
7918f24d 926 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 927
3dab1dad
YO
928 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
929 (int)depth * 2 + 2,"",
930 "Match","Base","Ofs" );
931
932 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 933 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 934 if ( tmp ) {
ab3bbdeb
YO
935 PerlIO_printf( Perl_debug_log, "%*s",
936 colwidth,
ddc5bc0f 937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
941 )
942 );
3dab1dad
YO
943 }
944 }
945 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
946 (int)depth * 2 + 2,"");
947
948 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 949 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
950 PerlIO_printf( Perl_debug_log, "\n");
951
1e2e3d02 952 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 953 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
954
955 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
956
957 if ( trie->states[ state ].wordnum ) {
958 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
959 } else {
960 PerlIO_printf( Perl_debug_log, "%6s", "" );
961 }
962
963 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
964
965 if ( base ) {
966 U32 ofs = 0;
967
968 while( ( base + ofs < trie->uniquecharcount ) ||
969 ( base + ofs - trie->uniquecharcount < trie->lasttrans
970 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
971 ofs++;
972
973 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
974
975 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
976 if ( ( base + ofs >= trie->uniquecharcount ) &&
977 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
978 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
979 {
ab3bbdeb
YO
980 PerlIO_printf( Perl_debug_log, "%*"UVXf,
981 colwidth,
3dab1dad
YO
982 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
983 } else {
ab3bbdeb 984 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
985 }
986 }
987
988 PerlIO_printf( Perl_debug_log, "]");
989
990 }
991 PerlIO_printf( Perl_debug_log, "\n" );
992 }
2e64971a
DM
993 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
994 for (word=1; word <= trie->wordcount; word++) {
995 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
996 (int)word, (int)(trie->wordinfo[word].prev),
997 (int)(trie->wordinfo[word].len));
998 }
999 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1000}
1001/*
3dab1dad
YO
1002 Dumps a fully constructed but uncompressed trie in list form.
1003 List tries normally only are used for construction when the number of
1004 possible chars (trie->uniquecharcount) is very high.
1005 Used for debugging make_trie().
1006*/
1007STATIC void
55eed653 1008S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1009 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1010 U32 depth)
3dab1dad
YO
1011{
1012 U32 state;
ab3bbdeb 1013 SV *sv=sv_newmortal();
55eed653 1014 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1015 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1016
1017 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1018
3dab1dad 1019 /* print out the table precompression. */
ab3bbdeb
YO
1020 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1021 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1022 "------:-----+-----------------\n" );
3dab1dad
YO
1023
1024 for( state=1 ; state < next_alloc ; state ++ ) {
1025 U16 charid;
1026
ab3bbdeb 1027 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1028 (int)depth * 2 + 2,"", (UV)state );
1029 if ( ! trie->states[ state ].wordnum ) {
1030 PerlIO_printf( Perl_debug_log, "%5s| ","");
1031 } else {
1032 PerlIO_printf( Perl_debug_log, "W%4x| ",
1033 trie->states[ state ].wordnum
1034 );
1035 }
1036 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1037 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1038 if ( tmp ) {
1039 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1040 colwidth,
ddc5bc0f 1041 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1042 PL_colors[0], PL_colors[1],
1043 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1044 PERL_PV_ESCAPE_FIRSTCHAR
1045 ) ,
1e2e3d02
YO
1046 TRIE_LIST_ITEM(state,charid).forid,
1047 (UV)TRIE_LIST_ITEM(state,charid).newstate
1048 );
1049 if (!(charid % 10))
664e119d
RGS
1050 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1051 (int)((depth * 2) + 14), "");
1e2e3d02 1052 }
ab3bbdeb
YO
1053 }
1054 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1055 }
1056}
1057
1058/*
3dab1dad
YO
1059 Dumps a fully constructed but uncompressed trie in table form.
1060 This is the normal DFA style state transition table, with a few
1061 twists to facilitate compression later.
1062 Used for debugging make_trie().
1063*/
1064STATIC void
55eed653 1065S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1066 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1067 U32 depth)
3dab1dad
YO
1068{
1069 U32 state;
1070 U16 charid;
ab3bbdeb 1071 SV *sv=sv_newmortal();
55eed653 1072 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1073 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1074
1075 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1076
1077 /*
1078 print out the table precompression so that we can do a visual check
1079 that they are identical.
1080 */
1081
1082 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1083
1084 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1085 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1086 if ( tmp ) {
ab3bbdeb
YO
1087 PerlIO_printf( Perl_debug_log, "%*s",
1088 colwidth,
ddc5bc0f 1089 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1090 PL_colors[0], PL_colors[1],
1091 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1092 PERL_PV_ESCAPE_FIRSTCHAR
1093 )
1094 );
3dab1dad
YO
1095 }
1096 }
1097
1098 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1099
1100 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1101 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1102 }
1103
1104 PerlIO_printf( Perl_debug_log, "\n" );
1105
1106 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1107
1108 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1109 (int)depth * 2 + 2,"",
1110 (UV)TRIE_NODENUM( state ) );
1111
1112 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1113 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1114 if (v)
1115 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1116 else
1117 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1118 }
1119 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1120 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1121 } else {
1122 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1123 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1124 }
1125 }
07be1b83 1126}
3dab1dad
YO
1127
1128#endif
1129
2e64971a 1130
786e8c11
YO
1131/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1132 startbranch: the first branch in the whole branch sequence
1133 first : start branch of sequence of branch-exact nodes.
1134 May be the same as startbranch
1135 last : Thing following the last branch.
1136 May be the same as tail.
1137 tail : item following the branch sequence
1138 count : words in the sequence
1139 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1140 depth : indent depth
3dab1dad 1141
786e8c11 1142Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1143
786e8c11
YO
1144A trie is an N'ary tree where the branches are determined by digital
1145decomposition of the key. IE, at the root node you look up the 1st character and
1146follow that branch repeat until you find the end of the branches. Nodes can be
1147marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1148
786e8c11 1149 /he|she|his|hers/
72f13be8 1150
786e8c11
YO
1151would convert into the following structure. Numbers represent states, letters
1152following numbers represent valid transitions on the letter from that state, if
1153the number is in square brackets it represents an accepting state, otherwise it
1154will be in parenthesis.
07be1b83 1155
786e8c11
YO
1156 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1157 | |
1158 | (2)
1159 | |
1160 (1) +-i->(6)-+-s->[7]
1161 |
1162 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1163
786e8c11
YO
1164 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1165
1166This shows that when matching against the string 'hers' we will begin at state 1
1167read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1168then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1169is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1170single traverse. We store a mapping from accepting to state to which word was
1171matched, and then when we have multiple possibilities we try to complete the
1172rest of the regex in the order in which they occured in the alternation.
1173
1174The only prior NFA like behaviour that would be changed by the TRIE support is
1175the silent ignoring of duplicate alternations which are of the form:
1176
1177 / (DUPE|DUPE) X? (?{ ... }) Y /x
1178
4b714af6 1179Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1180and without the optimisation. With the optimisations dupes will be silently
486ec47a 1181ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1182the following demonstrates:
1183
1184 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1185
1186which prints out 'word' three times, but
1187
1188 'words'=~/(word|word|word)(?{ print $1 })S/
1189
1190which doesnt print it out at all. This is due to other optimisations kicking in.
1191
1192Example of what happens on a structural level:
1193
486ec47a 1194The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1195
1196 1: CURLYM[1] {1,32767}(18)
1197 5: BRANCH(8)
1198 6: EXACT <ac>(16)
1199 8: BRANCH(11)
1200 9: EXACT <ad>(16)
1201 11: BRANCH(14)
1202 12: EXACT <ab>(16)
1203 16: SUCCEED(0)
1204 17: NOTHING(18)
1205 18: END(0)
1206
1207This would be optimizable with startbranch=5, first=5, last=16, tail=16
1208and should turn into:
1209
1210 1: CURLYM[1] {1,32767}(18)
1211 5: TRIE(16)
1212 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1213 <ac>
1214 <ad>
1215 <ab>
1216 16: SUCCEED(0)
1217 17: NOTHING(18)
1218 18: END(0)
1219
1220Cases where tail != last would be like /(?foo|bar)baz/:
1221
1222 1: BRANCH(4)
1223 2: EXACT <foo>(8)
1224 4: BRANCH(7)
1225 5: EXACT <bar>(8)
1226 7: TAIL(8)
1227 8: EXACT <baz>(10)
1228 10: END(0)
1229
1230which would be optimizable with startbranch=1, first=1, last=7, tail=8
1231and would end up looking like:
1232
1233 1: TRIE(8)
1234 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1235 <foo>
1236 <bar>
1237 7: TAIL(8)
1238 8: EXACT <baz>(10)
1239 10: END(0)
1240
1241 d = uvuni_to_utf8_flags(d, uv, 0);
1242
1243is the recommended Unicode-aware way of saying
1244
1245 *(d++) = uv;
1246*/
1247
1e2e3d02 1248#define TRIE_STORE_REVCHAR \
786e8c11 1249 STMT_START { \
73031816
NC
1250 if (UTF) { \
1251 SV *zlopp = newSV(2); \
88c9ea1e
CB
1252 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1253 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1254 SvCUR_set(zlopp, kapow - flrbbbbb); \
1255 SvPOK_on(zlopp); \
1256 SvUTF8_on(zlopp); \
1257 av_push(revcharmap, zlopp); \
1258 } else { \
6bdeddd2 1259 char ooooff = (char)uvc; \
73031816
NC
1260 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1261 } \
1262 } STMT_END
786e8c11
YO
1263
1264#define TRIE_READ_CHAR STMT_START { \
1265 wordlen++; \
1266 if ( UTF ) { \
1267 if ( folder ) { \
1268 if ( foldlen > 0 ) { \
1269 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1270 foldlen -= len; \
1271 scan += len; \
1272 len = 0; \
1273 } else { \
1274 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1275 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1276 foldlen -= UNISKIP( uvc ); \
1277 scan = foldbuf + UNISKIP( uvc ); \
1278 } \
1279 } else { \
1280 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1281 } \
1282 } else { \
1283 uvc = (U32)*uc; \
1284 len = 1; \
1285 } \
1286} STMT_END
1287
1288
1289
1290#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1291 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1292 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1293 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1294 } \
1295 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1296 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1297 TRIE_LIST_CUR( state )++; \
1298} STMT_END
07be1b83 1299
786e8c11
YO
1300#define TRIE_LIST_NEW(state) STMT_START { \
1301 Newxz( trie->states[ state ].trans.list, \
1302 4, reg_trie_trans_le ); \
1303 TRIE_LIST_CUR( state ) = 1; \
1304 TRIE_LIST_LEN( state ) = 4; \
1305} STMT_END
07be1b83 1306
786e8c11
YO
1307#define TRIE_HANDLE_WORD(state) STMT_START { \
1308 U16 dupe= trie->states[ state ].wordnum; \
1309 regnode * const noper_next = regnext( noper ); \
1310 \
786e8c11
YO
1311 DEBUG_r({ \
1312 /* store the word for dumping */ \
1313 SV* tmp; \
1314 if (OP(noper) != NOTHING) \
740cce10 1315 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1316 else \
740cce10 1317 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1318 av_push( trie_words, tmp ); \
786e8c11
YO
1319 }); \
1320 \
1321 curword++; \
2e64971a
DM
1322 trie->wordinfo[curword].prev = 0; \
1323 trie->wordinfo[curword].len = wordlen; \
1324 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1325 \
1326 if ( noper_next < tail ) { \
1327 if (!trie->jump) \
c944940b 1328 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1329 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1330 if (!jumper) \
1331 jumper = noper_next; \
1332 if (!nextbranch) \
1333 nextbranch= regnext(cur); \
1334 } \
1335 \
1336 if ( dupe ) { \
2e64971a
DM
1337 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1338 /* chain, so that when the bits of chain are later */\
1339 /* linked together, the dups appear in the chain */\
1340 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1341 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1342 } else { \
1343 /* we haven't inserted this word yet. */ \
1344 trie->states[ state ].wordnum = curword; \
1345 } \
1346} STMT_END
07be1b83 1347
3dab1dad 1348
786e8c11
YO
1349#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1350 ( ( base + charid >= ucharcount \
1351 && base + charid < ubound \
1352 && state == trie->trans[ base - ucharcount + charid ].check \
1353 && trie->trans[ base - ucharcount + charid ].next ) \
1354 ? trie->trans[ base - ucharcount + charid ].next \
1355 : ( state==1 ? special : 0 ) \
1356 )
3dab1dad 1357
786e8c11
YO
1358#define MADE_TRIE 1
1359#define MADE_JUMP_TRIE 2
1360#define MADE_EXACT_TRIE 4
3dab1dad 1361
a3621e74 1362STATIC I32
786e8c11 1363S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1364{
27da23d5 1365 dVAR;
a3621e74
YO
1366 /* first pass, loop through and scan words */
1367 reg_trie_data *trie;
55eed653 1368 HV *widecharmap = NULL;
2b8b4781 1369 AV *revcharmap = newAV();
a3621e74 1370 regnode *cur;
9f7f3913 1371 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1372 STRLEN len = 0;
1373 UV uvc = 0;
1374 U16 curword = 0;
1375 U32 next_alloc = 0;
786e8c11
YO
1376 regnode *jumper = NULL;
1377 regnode *nextbranch = NULL;
7f69552c 1378 regnode *convert = NULL;
2e64971a 1379 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1380 /* we just use folder as a flag in utf8 */
1e696034 1381 const U8 * folder = NULL;
a3621e74 1382
2b8b4781
NC
1383#ifdef DEBUGGING
1384 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1385 AV *trie_words = NULL;
1386 /* along with revcharmap, this only used during construction but both are
1387 * useful during debugging so we store them in the struct when debugging.
8e11feef 1388 */
2b8b4781
NC
1389#else
1390 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1391 STRLEN trie_charcount=0;
3dab1dad 1392#endif
2b8b4781 1393 SV *re_trie_maxbuff;
a3621e74 1394 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1395
1396 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1397#ifndef DEBUGGING
1398 PERL_UNUSED_ARG(depth);
1399#endif
a3621e74 1400
1e696034 1401 switch (flags) {
2f7f8cb1 1402 case EXACTFA:
1e696034
KW
1403 case EXACTFU: folder = PL_fold_latin1; break;
1404 case EXACTF: folder = PL_fold; break;
1405 case EXACTFL: folder = PL_fold_locale; break;
1406 }
1407
c944940b 1408 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1409 trie->refcount = 1;
3dab1dad 1410 trie->startstate = 1;
786e8c11 1411 trie->wordcount = word_count;
f8fc2ecf 1412 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1413 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1414 if (!(UTF && folder))
c944940b 1415 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1416 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1417 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1418
a3621e74 1419 DEBUG_r({
2b8b4781 1420 trie_words = newAV();
a3621e74 1421 });
a3621e74 1422
0111c4fd 1423 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1424 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1425 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1426 }
3dab1dad
YO
1427 DEBUG_OPTIMISE_r({
1428 PerlIO_printf( Perl_debug_log,
786e8c11 1429 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1430 (int)depth * 2 + 2, "",
1431 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1432 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1433 (int)depth);
3dab1dad 1434 });
7f69552c
YO
1435
1436 /* Find the node we are going to overwrite */
1437 if ( first == startbranch && OP( last ) != BRANCH ) {
1438 /* whole branch chain */
1439 convert = first;
1440 } else {
1441 /* branch sub-chain */
1442 convert = NEXTOPER( first );
1443 }
1444
a3621e74
YO
1445 /* -- First loop and Setup --
1446
1447 We first traverse the branches and scan each word to determine if it
1448 contains widechars, and how many unique chars there are, this is
1449 important as we have to build a table with at least as many columns as we
1450 have unique chars.
1451
1452 We use an array of integers to represent the character codes 0..255
38a44b82 1453 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1454 native representation of the character value as the key and IV's for the
1455 coded index.
1456
1457 *TODO* If we keep track of how many times each character is used we can
1458 remap the columns so that the table compression later on is more
3b753521 1459 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1460 middle and the least common are on the outside. IMO this would be better
1461 than a most to least common mapping as theres a decent chance the most
1462 common letter will share a node with the least common, meaning the node
486ec47a 1463 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1464 case is when we have the least common nodes twice.
1465
1466 */
1467
a3621e74 1468 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1469 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1470 const U8 *uc = (U8*)STRING( noper );
a28509cc 1471 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1472 STRLEN foldlen = 0;
1473 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1474 const U8 *scan = (U8*)NULL;
07be1b83 1475 U32 wordlen = 0; /* required init */
02daf0ab
YO
1476 STRLEN chars = 0;
1477 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1478
3dab1dad
YO
1479 if (OP(noper) == NOTHING) {
1480 trie->minlen= 0;
1481 continue;
1482 }
02daf0ab
YO
1483 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1484 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1485 regardless of encoding */
1486
a3621e74 1487 for ( ; uc < e ; uc += len ) {
3dab1dad 1488 TRIE_CHARCOUNT(trie)++;
a3621e74 1489 TRIE_READ_CHAR;
3dab1dad 1490 chars++;
a3621e74
YO
1491 if ( uvc < 256 ) {
1492 if ( !trie->charmap[ uvc ] ) {
1493 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1494 if ( folder )
1495 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1496 TRIE_STORE_REVCHAR;
a3621e74 1497 }
02daf0ab 1498 if ( set_bit ) {
62012aee
KW
1499 /* store the codepoint in the bitmap, and its folded
1500 * equivalent. */
02daf0ab 1501 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1502
1503 /* store the folded codepoint */
1504 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1505
1506 if ( !UTF ) {
1507 /* store first byte of utf8 representation of
acdf4139
KW
1508 variant codepoints */
1509 if (! UNI_IS_INVARIANT(uvc)) {
1510 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1511 }
1512 }
02daf0ab
YO
1513 set_bit = 0; /* We've done our bit :-) */
1514 }
a3621e74
YO
1515 } else {
1516 SV** svpp;
55eed653
NC
1517 if ( !widecharmap )
1518 widecharmap = newHV();
a3621e74 1519
55eed653 1520 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1521
1522 if ( !svpp )
e4584336 1523 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1524
1525 if ( !SvTRUE( *svpp ) ) {
1526 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1527 TRIE_STORE_REVCHAR;
a3621e74
YO
1528 }
1529 }
1530 }
3dab1dad
YO
1531 if( cur == first ) {
1532 trie->minlen=chars;
1533 trie->maxlen=chars;
1534 } else if (chars < trie->minlen) {
1535 trie->minlen=chars;
1536 } else if (chars > trie->maxlen) {
1537 trie->maxlen=chars;
1538 }
1539
a3621e74
YO
1540 } /* end first pass */
1541 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1542 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1543 (int)depth * 2 + 2,"",
55eed653 1544 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1545 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1546 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1547 );
a3621e74
YO
1548
1549 /*
1550 We now know what we are dealing with in terms of unique chars and
1551 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1552 representation using a flat table will take. If it's over a reasonable
1553 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1554 conservative but potentially much slower representation using an array
1555 of lists.
1556
1557 At the end we convert both representations into the same compressed
1558 form that will be used in regexec.c for matching with. The latter
1559 is a form that cannot be used to construct with but has memory
1560 properties similar to the list form and access properties similar
1561 to the table form making it both suitable for fast searches and
1562 small enough that its feasable to store for the duration of a program.
1563
1564 See the comment in the code where the compressed table is produced
1565 inplace from the flat tabe representation for an explanation of how
1566 the compression works.
1567
1568 */
1569
1570
2e64971a
DM
1571 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1572 prev_states[1] = 0;
1573
3dab1dad 1574 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1575 /*
1576 Second Pass -- Array Of Lists Representation
1577
1578 Each state will be represented by a list of charid:state records
1579 (reg_trie_trans_le) the first such element holds the CUR and LEN
1580 points of the allocated array. (See defines above).
1581
1582 We build the initial structure using the lists, and then convert
1583 it into the compressed table form which allows faster lookups
1584 (but cant be modified once converted).
a3621e74
YO
1585 */
1586
a3621e74
YO
1587 STRLEN transcount = 1;
1588
1e2e3d02
YO
1589 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1590 "%*sCompiling trie using list compiler\n",
1591 (int)depth * 2 + 2, ""));
446bd890 1592
c944940b
JH
1593 trie->states = (reg_trie_state *)
1594 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1595 sizeof(reg_trie_state) );
a3621e74
YO
1596 TRIE_LIST_NEW(1);
1597 next_alloc = 2;
1598
1599 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1600
c445ea15
AL
1601 regnode * const noper = NEXTOPER( cur );
1602 U8 *uc = (U8*)STRING( noper );
1603 const U8 * const e = uc + STR_LEN( noper );
1604 U32 state = 1; /* required init */
1605 U16 charid = 0; /* sanity init */
1606 U8 *scan = (U8*)NULL; /* sanity init */
1607 STRLEN foldlen = 0; /* required init */
07be1b83 1608 U32 wordlen = 0; /* required init */
c445ea15
AL
1609 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1610
3dab1dad 1611 if (OP(noper) != NOTHING) {
786e8c11 1612 for ( ; uc < e ; uc += len ) {
c445ea15 1613
786e8c11 1614 TRIE_READ_CHAR;
c445ea15 1615
786e8c11
YO
1616 if ( uvc < 256 ) {
1617 charid = trie->charmap[ uvc ];
c445ea15 1618 } else {
55eed653 1619 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1620 if ( !svpp ) {
1621 charid = 0;
1622 } else {
1623 charid=(U16)SvIV( *svpp );
1624 }
c445ea15 1625 }
786e8c11
YO
1626 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1627 if ( charid ) {
a3621e74 1628
786e8c11
YO
1629 U16 check;
1630 U32 newstate = 0;
a3621e74 1631
786e8c11
YO
1632 charid--;
1633 if ( !trie->states[ state ].trans.list ) {
1634 TRIE_LIST_NEW( state );
c445ea15 1635 }
786e8c11
YO
1636 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1637 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1638 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1639 break;
1640 }
1641 }
1642 if ( ! newstate ) {
1643 newstate = next_alloc++;
2e64971a 1644 prev_states[newstate] = state;
786e8c11
YO
1645 TRIE_LIST_PUSH( state, charid, newstate );
1646 transcount++;
1647 }
1648 state = newstate;
1649 } else {
1650 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1651 }
a28509cc 1652 }
c445ea15 1653 }
3dab1dad 1654 TRIE_HANDLE_WORD(state);
a3621e74
YO
1655
1656 } /* end second pass */
1657
1e2e3d02
YO
1658 /* next alloc is the NEXT state to be allocated */
1659 trie->statecount = next_alloc;
c944940b
JH
1660 trie->states = (reg_trie_state *)
1661 PerlMemShared_realloc( trie->states,
1662 next_alloc
1663 * sizeof(reg_trie_state) );
a3621e74 1664
3dab1dad 1665 /* and now dump it out before we compress it */
2b8b4781
NC
1666 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1667 revcharmap, next_alloc,
1668 depth+1)
1e2e3d02 1669 );
a3621e74 1670
c944940b
JH
1671 trie->trans = (reg_trie_trans *)
1672 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1673 {
1674 U32 state;
a3621e74
YO
1675 U32 tp = 0;
1676 U32 zp = 0;
1677
1678
1679 for( state=1 ; state < next_alloc ; state ++ ) {
1680 U32 base=0;
1681
1682 /*
1683 DEBUG_TRIE_COMPILE_MORE_r(
1684 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1685 );
1686 */
1687
1688 if (trie->states[state].trans.list) {
1689 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1690 U16 maxid=minid;
a28509cc 1691 U16 idx;
a3621e74
YO
1692
1693 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1694 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1695 if ( forid < minid ) {
1696 minid=forid;
1697 } else if ( forid > maxid ) {
1698 maxid=forid;
1699 }
a3621e74
YO
1700 }
1701 if ( transcount < tp + maxid - minid + 1) {
1702 transcount *= 2;
c944940b
JH
1703 trie->trans = (reg_trie_trans *)
1704 PerlMemShared_realloc( trie->trans,
446bd890
NC
1705 transcount
1706 * sizeof(reg_trie_trans) );
a3621e74
YO
1707 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1708 }
1709 base = trie->uniquecharcount + tp - minid;
1710 if ( maxid == minid ) {
1711 U32 set = 0;
1712 for ( ; zp < tp ; zp++ ) {
1713 if ( ! trie->trans[ zp ].next ) {
1714 base = trie->uniquecharcount + zp - minid;
1715 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1716 trie->trans[ zp ].check = state;
1717 set = 1;
1718 break;
1719 }
1720 }
1721 if ( !set ) {
1722 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1723 trie->trans[ tp ].check = state;
1724 tp++;
1725 zp = tp;
1726 }
1727 } else {
1728 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1729 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1730 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1731 trie->trans[ tid ].check = state;
1732 }
1733 tp += ( maxid - minid + 1 );
1734 }
1735 Safefree(trie->states[ state ].trans.list);
1736 }
1737 /*
1738 DEBUG_TRIE_COMPILE_MORE_r(
1739 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1740 );
1741 */
1742 trie->states[ state ].trans.base=base;
1743 }
cc601c31 1744 trie->lasttrans = tp + 1;
a3621e74
YO
1745 }
1746 } else {
1747 /*
1748 Second Pass -- Flat Table Representation.
1749
1750 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1751 We know that we will need Charcount+1 trans at most to store the data
1752 (one row per char at worst case) So we preallocate both structures
1753 assuming worst case.
1754
1755 We then construct the trie using only the .next slots of the entry
1756 structs.
1757
3b753521 1758 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1759 make compression both faster and easier by keeping track of how many non
1760 zero fields are in the node.
1761
1762 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1763 transition.
1764
1765 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1766 number representing the first entry of the node, and state as a
1767 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1768 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1769 are 2 entrys per node. eg:
1770
1771 A B A B
1772 1. 2 4 1. 3 7
1773 2. 0 3 3. 0 5
1774 3. 0 0 5. 0 0
1775 4. 0 0 7. 0 0
1776
1777 The table is internally in the right hand, idx form. However as we also
1778 have to deal with the states array which is indexed by nodenum we have to
1779 use TRIE_NODENUM() to convert.
1780
1781 */
1e2e3d02
YO
1782 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1783 "%*sCompiling trie using table compiler\n",
1784 (int)depth * 2 + 2, ""));
3dab1dad 1785
c944940b
JH
1786 trie->trans = (reg_trie_trans *)
1787 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1788 * trie->uniquecharcount + 1,
1789 sizeof(reg_trie_trans) );
1790 trie->states = (reg_trie_state *)
1791 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1792 sizeof(reg_trie_state) );
a3621e74
YO
1793 next_alloc = trie->uniquecharcount + 1;
1794
3dab1dad 1795
a3621e74
YO
1796 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1797
c445ea15 1798 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1799 const U8 *uc = (U8*)STRING( noper );
1800 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1801
1802 U32 state = 1; /* required init */
1803
1804 U16 charid = 0; /* sanity init */
1805 U32 accept_state = 0; /* sanity init */
1806 U8 *scan = (U8*)NULL; /* sanity init */
1807
1808 STRLEN foldlen = 0; /* required init */
07be1b83 1809 U32 wordlen = 0; /* required init */
a3621e74
YO
1810 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1811
3dab1dad 1812 if ( OP(noper) != NOTHING ) {
786e8c11 1813 for ( ; uc < e ; uc += len ) {
a3621e74 1814
786e8c11 1815 TRIE_READ_CHAR;
a3621e74 1816
786e8c11
YO
1817 if ( uvc < 256 ) {
1818 charid = trie->charmap[ uvc ];
1819 } else {
55eed653 1820 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1821 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1822 }
786e8c11
YO
1823 if ( charid ) {
1824 charid--;
1825 if ( !trie->trans[ state + charid ].next ) {
1826 trie->trans[ state + charid ].next = next_alloc;
1827 trie->trans[ state ].check++;
2e64971a
DM
1828 prev_states[TRIE_NODENUM(next_alloc)]
1829 = TRIE_NODENUM(state);
786e8c11
YO
1830 next_alloc += trie->uniquecharcount;
1831 }
1832 state = trie->trans[ state + charid ].next;
1833 } else {
1834 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1835 }
1836 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1837 }
a3621e74 1838 }
3dab1dad
YO
1839 accept_state = TRIE_NODENUM( state );
1840 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1841
1842 } /* end second pass */
1843
3dab1dad 1844 /* and now dump it out before we compress it */
2b8b4781
NC
1845 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1846 revcharmap,
1847 next_alloc, depth+1));
a3621e74 1848
a3621e74
YO
1849 {
1850 /*
1851 * Inplace compress the table.*
1852
1853 For sparse data sets the table constructed by the trie algorithm will
1854 be mostly 0/FAIL transitions or to put it another way mostly empty.
1855 (Note that leaf nodes will not contain any transitions.)
1856
1857 This algorithm compresses the tables by eliminating most such
1858 transitions, at the cost of a modest bit of extra work during lookup:
1859
1860 - Each states[] entry contains a .base field which indicates the
1861 index in the state[] array wheres its transition data is stored.
1862
3b753521 1863 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1864
1865 - If .base is nonzero then charid is added to it to find an entry in
1866 the trans array.
1867
1868 -If trans[states[state].base+charid].check!=state then the
1869 transition is taken to be a 0/Fail transition. Thus if there are fail
1870 transitions at the front of the node then the .base offset will point
1871 somewhere inside the previous nodes data (or maybe even into a node
1872 even earlier), but the .check field determines if the transition is
1873 valid.
1874
786e8c11 1875 XXX - wrong maybe?
a3621e74 1876 The following process inplace converts the table to the compressed
3b753521 1877 table: We first do not compress the root node 1,and mark all its
a3621e74 1878 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1879 allows us to do a DFA construction from the compressed table later,
1880 and ensures that any .base pointers we calculate later are greater
1881 than 0.
a3621e74
YO
1882
1883 - We set 'pos' to indicate the first entry of the second node.
1884
1885 - We then iterate over the columns of the node, finding the first and
1886 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1887 and set the .check pointers accordingly, and advance pos
1888 appropriately and repreat for the next node. Note that when we copy
1889 the next pointers we have to convert them from the original
1890 NODEIDX form to NODENUM form as the former is not valid post
1891 compression.
1892
1893 - If a node has no transitions used we mark its base as 0 and do not
1894 advance the pos pointer.
1895
1896 - If a node only has one transition we use a second pointer into the
1897 structure to fill in allocated fail transitions from other states.
1898 This pointer is independent of the main pointer and scans forward
1899 looking for null transitions that are allocated to a state. When it
1900 finds one it writes the single transition into the "hole". If the
786e8c11 1901 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1902
1903 - Once compressed we can Renew/realloc the structures to release the
1904 excess space.
1905
1906 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1907 specifically Fig 3.47 and the associated pseudocode.
1908
1909 demq
1910 */
a3b680e6 1911 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1912 U32 state, charid;
a3621e74 1913 U32 pos = 0, zp=0;
1e2e3d02 1914 trie->statecount = laststate;
a3621e74
YO
1915
1916 for ( state = 1 ; state < laststate ; state++ ) {
1917 U8 flag = 0;
a28509cc
AL
1918 const U32 stateidx = TRIE_NODEIDX( state );
1919 const U32 o_used = trie->trans[ stateidx ].check;
1920 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1921 trie->trans[ stateidx ].check = 0;
1922
1923 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1924 if ( flag || trie->trans[ stateidx + charid ].next ) {
1925 if ( trie->trans[ stateidx + charid ].next ) {
1926 if (o_used == 1) {
1927 for ( ; zp < pos ; zp++ ) {
1928 if ( ! trie->trans[ zp ].next ) {
1929 break;
1930 }
1931 }
1932 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1933 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1934 trie->trans[ zp ].check = state;
1935 if ( ++zp > pos ) pos = zp;
1936 break;
1937 }
1938 used--;
1939 }
1940 if ( !flag ) {
1941 flag = 1;
1942 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1943 }
1944 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1945 trie->trans[ pos ].check = state;
1946 pos++;
1947 }
1948 }
1949 }
cc601c31 1950 trie->lasttrans = pos + 1;
c944940b
JH
1951 trie->states = (reg_trie_state *)
1952 PerlMemShared_realloc( trie->states, laststate
1953 * sizeof(reg_trie_state) );
a3621e74 1954 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1955 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1956 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1957 (int)depth * 2 + 2,"",
1958 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1959 (IV)next_alloc,
1960 (IV)pos,
a3621e74
YO
1961 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1962 );
1963
1964 } /* end table compress */
1965 }
1e2e3d02
YO
1966 DEBUG_TRIE_COMPILE_MORE_r(
1967 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1968 (int)depth * 2 + 2, "",
1969 (UV)trie->statecount,
1970 (UV)trie->lasttrans)
1971 );
cc601c31 1972 /* resize the trans array to remove unused space */
c944940b
JH
1973 trie->trans = (reg_trie_trans *)
1974 PerlMemShared_realloc( trie->trans, trie->lasttrans
1975 * sizeof(reg_trie_trans) );
a3621e74 1976
3b753521 1977 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1978 U8 nodetype =(U8)(flags & 0xFF);
1979 char *str=NULL;
786e8c11 1980
07be1b83 1981#ifdef DEBUGGING
e62cc96a 1982 regnode *optimize = NULL;
7122b237
YO
1983#ifdef RE_TRACK_PATTERN_OFFSETS
1984
b57a0404
JH
1985 U32 mjd_offset = 0;
1986 U32 mjd_nodelen = 0;
7122b237
YO
1987#endif /* RE_TRACK_PATTERN_OFFSETS */
1988#endif /* DEBUGGING */
a3621e74 1989 /*
3dab1dad
YO
1990 This means we convert either the first branch or the first Exact,
1991 depending on whether the thing following (in 'last') is a branch
1992 or not and whther first is the startbranch (ie is it a sub part of
1993 the alternation or is it the whole thing.)
3b753521 1994 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 1995 the whole branch sequence, including the first.
a3621e74 1996 */
3dab1dad 1997 /* Find the node we are going to overwrite */
7f69552c 1998 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1999 /* branch sub-chain */
3dab1dad 2000 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2001#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2002 DEBUG_r({
2003 mjd_offset= Node_Offset((convert));
2004 mjd_nodelen= Node_Length((convert));
2005 });
7122b237 2006#endif
7f69552c 2007 /* whole branch chain */
7122b237
YO
2008 }
2009#ifdef RE_TRACK_PATTERN_OFFSETS
2010 else {
7f69552c
YO
2011 DEBUG_r({
2012 const regnode *nop = NEXTOPER( convert );
2013 mjd_offset= Node_Offset((nop));
2014 mjd_nodelen= Node_Length((nop));
2015 });
07be1b83
YO
2016 }
2017 DEBUG_OPTIMISE_r(
2018 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2019 (int)depth * 2 + 2, "",
786e8c11 2020 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2021 );
7122b237 2022#endif
3dab1dad
YO
2023 /* But first we check to see if there is a common prefix we can
2024 split out as an EXACT and put in front of the TRIE node. */
2025 trie->startstate= 1;
55eed653 2026 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2027 U32 state;
1e2e3d02 2028 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2029 U32 ofs = 0;
8e11feef
RGS
2030 I32 idx = -1;
2031 U32 count = 0;
2032 const U32 base = trie->states[ state ].trans.base;
a3621e74 2033
3dab1dad 2034 if ( trie->states[state].wordnum )
8e11feef 2035 count = 1;
a3621e74 2036
8e11feef 2037 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2038 if ( ( base + ofs >= trie->uniquecharcount ) &&
2039 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2040 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2041 {
3dab1dad 2042 if ( ++count > 1 ) {
2b8b4781 2043 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2044 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2045 if ( state == 1 ) break;
3dab1dad
YO
2046 if ( count == 2 ) {
2047 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2048 DEBUG_OPTIMISE_r(
8e11feef
RGS
2049 PerlIO_printf(Perl_debug_log,
2050 "%*sNew Start State=%"UVuf" Class: [",
2051 (int)depth * 2 + 2, "",
786e8c11 2052 (UV)state));
be8e71aa 2053 if (idx >= 0) {
2b8b4781 2054 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2055 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2056
3dab1dad 2057 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2058 if ( folder )
2059 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2060 DEBUG_OPTIMISE_r(
f1f66076 2061 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2062 );
8e11feef
RGS
2063 }
2064 }
2065 TRIE_BITMAP_SET(trie,*ch);
2066 if ( folder )
2067 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2068 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2069 }
2070 idx = ofs;
2071 }
3dab1dad
YO
2072 }
2073 if ( count == 1 ) {
2b8b4781 2074 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2075 STRLEN len;
2076 char *ch = SvPV( *tmp, len );
de734bd5
A
2077 DEBUG_OPTIMISE_r({
2078 SV *sv=sv_newmortal();
8e11feef
RGS
2079 PerlIO_printf( Perl_debug_log,
2080 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2081 (int)depth * 2 + 2, "",
de734bd5
A
2082 (UV)state, (UV)idx,
2083 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2084 PL_colors[0], PL_colors[1],
2085 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2086 PERL_PV_ESCAPE_FIRSTCHAR
2087 )
2088 );
2089 });
3dab1dad
YO
2090 if ( state==1 ) {
2091 OP( convert ) = nodetype;
2092 str=STRING(convert);
2093 STR_LEN(convert)=0;
2094 }
c490c714
YO
2095 STR_LEN(convert) += len;
2096 while (len--)
de734bd5 2097 *str++ = *ch++;
8e11feef 2098 } else {
f9049ba1 2099#ifdef DEBUGGING
8e11feef
RGS
2100 if (state>1)
2101 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2102#endif
8e11feef
RGS
2103 break;
2104 }
2105 }
2e64971a 2106 trie->prefixlen = (state-1);
3dab1dad 2107 if (str) {
8e11feef 2108 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2109 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2110 trie->startstate = state;
07be1b83
YO
2111 trie->minlen -= (state - 1);
2112 trie->maxlen -= (state - 1);
33809eae
JH
2113#ifdef DEBUGGING
2114 /* At least the UNICOS C compiler choked on this
2115 * being argument to DEBUG_r(), so let's just have
2116 * it right here. */
2117 if (
2118#ifdef PERL_EXT_RE_BUILD
2119 1
2120#else
2121 DEBUG_r_TEST
2122#endif
2123 ) {
2124 regnode *fix = convert;
2125 U32 word = trie->wordcount;
2126 mjd_nodelen++;
2127 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2128 while( ++fix < n ) {
2129 Set_Node_Offset_Length(fix, 0, 0);
2130 }
2131 while (word--) {
2132 SV ** const tmp = av_fetch( trie_words, word, 0 );
2133 if (tmp) {
2134 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2135 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2136 else
2137 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2138 }
2139 }
2140 }
2141#endif
8e11feef
RGS
2142 if (trie->maxlen) {
2143 convert = n;
2144 } else {
3dab1dad 2145 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2146 DEBUG_r(optimize= n);
3dab1dad
YO
2147 }
2148 }
2149 }
a5ca303d
YO
2150 if (!jumper)
2151 jumper = last;
3dab1dad 2152 if ( trie->maxlen ) {
8e11feef
RGS
2153 NEXT_OFF( convert ) = (U16)(tail - convert);
2154 ARG_SET( convert, data_slot );
786e8c11
YO
2155 /* Store the offset to the first unabsorbed branch in
2156 jump[0], which is otherwise unused by the jump logic.
2157 We use this when dumping a trie and during optimisation. */
2158 if (trie->jump)
7f69552c 2159 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2160
6c48061a
YO
2161 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2162 * and there is a bitmap
2163 * and the first "jump target" node we found leaves enough room
2164 * then convert the TRIE node into a TRIEC node, with the bitmap
2165 * embedded inline in the opcode - this is hypothetically faster.
2166 */
2167 if ( !trie->states[trie->startstate].wordnum
2168 && trie->bitmap
2169 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2170 {
2171 OP( convert ) = TRIEC;
2172 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2173 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2174 trie->bitmap= NULL;
2175 } else
2176 OP( convert ) = TRIE;
a3621e74 2177
3dab1dad
YO
2178 /* store the type in the flags */
2179 convert->flags = nodetype;
a5ca303d
YO
2180 DEBUG_r({
2181 optimize = convert
2182 + NODE_STEP_REGNODE
2183 + regarglen[ OP( convert ) ];
2184 });
2185 /* XXX We really should free up the resource in trie now,
2186 as we won't use them - (which resources?) dmq */
3dab1dad 2187 }
a3621e74 2188 /* needed for dumping*/
e62cc96a 2189 DEBUG_r(if (optimize) {
07be1b83 2190 regnode *opt = convert;
bcdf7404 2191
e62cc96a 2192 while ( ++opt < optimize) {
07be1b83
YO
2193 Set_Node_Offset_Length(opt,0,0);
2194 }
786e8c11
YO
2195 /*
2196 Try to clean up some of the debris left after the
2197 optimisation.
a3621e74 2198 */
786e8c11 2199 while( optimize < jumper ) {
07be1b83 2200 mjd_nodelen += Node_Length((optimize));
a3621e74 2201 OP( optimize ) = OPTIMIZED;
07be1b83 2202 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2203 optimize++;
2204 }
07be1b83 2205 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2206 });
2207 } /* end node insert */
2e64971a
DM
2208
2209 /* Finish populating the prev field of the wordinfo array. Walk back
2210 * from each accept state until we find another accept state, and if
2211 * so, point the first word's .prev field at the second word. If the
2212 * second already has a .prev field set, stop now. This will be the
2213 * case either if we've already processed that word's accept state,
3b753521
FN
2214 * or that state had multiple words, and the overspill words were
2215 * already linked up earlier.
2e64971a
DM
2216 */
2217 {
2218 U16 word;
2219 U32 state;
2220 U16 prev;
2221
2222 for (word=1; word <= trie->wordcount; word++) {
2223 prev = 0;
2224 if (trie->wordinfo[word].prev)
2225 continue;
2226 state = trie->wordinfo[word].accept;
2227 while (state) {
2228 state = prev_states[state];
2229 if (!state)
2230 break;
2231 prev = trie->states[state].wordnum;
2232 if (prev)
2233 break;
2234 }
2235 trie->wordinfo[word].prev = prev;
2236 }
2237 Safefree(prev_states);
2238 }
2239
2240
2241 /* and now dump out the compressed format */
2242 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2243
55eed653 2244 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2245#ifdef DEBUGGING
2246 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2247 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2248#else
2249 SvREFCNT_dec(revcharmap);
07be1b83 2250#endif
786e8c11
YO
2251 return trie->jump
2252 ? MADE_JUMP_TRIE
2253 : trie->startstate>1
2254 ? MADE_EXACT_TRIE
2255 : MADE_TRIE;
2256}
2257
2258STATIC void
2259S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2260{
3b753521 2261/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2262
2263 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2264 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2265 ISBN 0-201-10088-6
2266
2267 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2268 suffix of the current state's 'word' that is also a proper prefix of another word in our
2269 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2270 the DFA not to have to restart after its tried and failed a word at a given point, it
2271 simply continues as though it had been matching the other word in the first place.
2272 Consider
2273 'abcdgu'=~/abcdefg|cdgu/
2274 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2275 fail, which would bring us to the state representing 'd' in the second word where we would
2276 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2277 */
2278 /* add a fail transition */
3251b653
NC
2279 const U32 trie_offset = ARG(source);
2280 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2281 U32 *q;
2282 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2283 const U32 numstates = trie->statecount;
786e8c11
YO
2284 const U32 ubound = trie->lasttrans + ucharcount;
2285 U32 q_read = 0;
2286 U32 q_write = 0;
2287 U32 charid;
2288 U32 base = trie->states[ 1 ].trans.base;
2289 U32 *fail;
2290 reg_ac_data *aho;
2291 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2292 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2293
2294 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2295#ifndef DEBUGGING
2296 PERL_UNUSED_ARG(depth);
2297#endif
2298
2299
2300 ARG_SET( stclass, data_slot );
c944940b 2301 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2302 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2303 aho->trie=trie_offset;
446bd890
NC
2304 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2305 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2306 Newxz( q, numstates, U32);
c944940b 2307 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2308 aho->refcount = 1;
2309 fail = aho->fail;
2310 /* initialize fail[0..1] to be 1 so that we always have
2311 a valid final fail state */
2312 fail[ 0 ] = fail[ 1 ] = 1;
2313
2314 for ( charid = 0; charid < ucharcount ; charid++ ) {
2315 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2316 if ( newstate ) {
2317 q[ q_write ] = newstate;
2318 /* set to point at the root */
2319 fail[ q[ q_write++ ] ]=1;
2320 }
2321 }
2322 while ( q_read < q_write) {
2323 const U32 cur = q[ q_read++ % numstates ];
2324 base = trie->states[ cur ].trans.base;
2325
2326 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2327 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2328 if (ch_state) {
2329 U32 fail_state = cur;
2330 U32 fail_base;
2331 do {
2332 fail_state = fail[ fail_state ];
2333 fail_base = aho->states[ fail_state ].trans.base;
2334 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2335
2336 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2337 fail[ ch_state ] = fail_state;
2338 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2339 {
2340 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2341 }
2342 q[ q_write++ % numstates] = ch_state;
2343 }
2344 }
2345 }
2346 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2347 when we fail in state 1, this allows us to use the
2348 charclass scan to find a valid start char. This is based on the principle
2349 that theres a good chance the string being searched contains lots of stuff
2350 that cant be a start char.
2351 */
2352 fail[ 0 ] = fail[ 1 ] = 0;
2353 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2354 PerlIO_printf(Perl_debug_log,
2355 "%*sStclass Failtable (%"UVuf" states): 0",
2356 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2357 );
786e8c11
YO
2358 for( q_read=1; q_read<numstates; q_read++ ) {
2359 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2360 }
2361 PerlIO_printf(Perl_debug_log, "\n");
2362 });
2363 Safefree(q);
2364 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2365}
2366
786e8c11 2367
a3621e74 2368/*
5d1c421c
JH
2369 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2370 * These need to be revisited when a newer toolchain becomes available.
2371 */
2372#if defined(__sparc64__) && defined(__GNUC__)
2373# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2374# undef SPARC64_GCC_WORKAROUND
2375# define SPARC64_GCC_WORKAROUND 1
2376# endif
2377#endif
2378
07be1b83 2379#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2380 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2381 SV * const mysv=sv_newmortal(); \
2382 regnode *Next = regnext(scan); \
2383 regprop(RExC_rx, mysv, scan); \
7f69552c 2384 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2385 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2386 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2387 }});
07be1b83 2388
1de06328
YO
2389
2390
2391
2392
07be1b83
YO
2393#define JOIN_EXACT(scan,min,flags) \
2394 if (PL_regkind[OP(scan)] == EXACT) \
2395 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2396
be8e71aa 2397STATIC U32
07be1b83
YO
2398S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2399 /* Merge several consecutive EXACTish nodes into one. */
2400 regnode *n = regnext(scan);
2401 U32 stringok = 1;
2402 regnode *next = scan + NODE_SZ_STR(scan);
2403 U32 merged = 0;
2404 U32 stopnow = 0;
2405#ifdef DEBUGGING
2406 regnode *stop = scan;
72f13be8 2407 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2408#else
d47053eb
RGS
2409 PERL_UNUSED_ARG(depth);
2410#endif
7918f24d
NC
2411
2412 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2413#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2414 PERL_UNUSED_ARG(flags);
2415 PERL_UNUSED_ARG(val);
07be1b83 2416#endif
07be1b83
YO
2417 DEBUG_PEEP("join",scan,depth);
2418
2419 /* Skip NOTHING, merge EXACT*. */
2420 while (n &&
2421 ( PL_regkind[OP(n)] == NOTHING ||
2422 (stringok && (OP(n) == OP(scan))))
2423 && NEXT_OFF(n)
2424 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2425
2426 if (OP(n) == TAIL || n > next)
2427 stringok = 0;
2428 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2429 DEBUG_PEEP("skip:",n,depth);
2430 NEXT_OFF(scan) += NEXT_OFF(n);
2431 next = n + NODE_STEP_REGNODE;
2432#ifdef DEBUGGING
2433 if (stringok)
2434 stop = n;
2435#endif
2436 n = regnext(n);
2437 }
2438 else if (stringok) {
786e8c11 2439 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2440 regnode * const nnext = regnext(n);
2441
2442 DEBUG_PEEP("merg",n,depth);
2443
2444 merged++;
2445 if (oldl + STR_LEN(n) > U8_MAX)
2446 break;
2447 NEXT_OFF(scan) += NEXT_OFF(n);
2448 STR_LEN(scan) += STR_LEN(n);
2449 next = n + NODE_SZ_STR(n);
2450 /* Now we can overwrite *n : */
2451 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2452#ifdef DEBUGGING
2453 stop = next - 1;
2454#endif
2455 n = nnext;
2456 if (stopnow) break;
2457 }
2458
d47053eb
RGS
2459#ifdef EXPERIMENTAL_INPLACESCAN
2460 if (flags && !NEXT_OFF(n)) {
2461 DEBUG_PEEP("atch", val, depth);
2462 if (reg_off_by_arg[OP(n)]) {
2463 ARG_SET(n, val - n);
2464 }
2465 else {
2466 NEXT_OFF(n) = val - n;
2467 }
2468 stopnow = 1;
2469 }
07be1b83
YO
2470#endif
2471 }
ced7f090
KW
2472#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2473#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2474#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2475#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2476
2477 if (UTF
2f7f8cb1 2478 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2c2b7f86
KW
2479 && ( STR_LEN(scan) >= 6 ) )
2480 {
07be1b83
YO
2481 /*
2482 Two problematic code points in Unicode casefolding of EXACT nodes:
2483
2484 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2485 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2486
2487 which casefold to
2488
2489 Unicode UTF-8
2490
2491 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2492 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2493
2494 This means that in case-insensitive matching (or "loose matching",
2495 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2496 length of the above casefolded versions) can match a target string
2497 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2498 This would rather mess up the minimum length computation.
2499
2500 What we'll do is to look for the tail four bytes, and then peek
2501 at the preceding two bytes to see whether we need to decrease
2502 the minimum length by four (six minus two).
2503
2504 Thanks to the design of UTF-8, there cannot be false matches:
2505 A sequence of valid UTF-8 bytes cannot be a subsequence of
2506 another valid sequence of UTF-8 bytes.
2507
2508 */
2509 char * const s0 = STRING(scan), *s, *t;
2510 char * const s1 = s0 + STR_LEN(scan) - 1;
2511 char * const s2 = s1 - 4;
e294cc5d
JH
2512#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2513 const char t0[] = "\xaf\x49\xaf\x42";
2514#else
07be1b83 2515 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2516#endif
07be1b83
YO
2517 const char * const t1 = t0 + 3;
2518
2519 for (s = s0 + 2;
2520 s < s2 && (t = ninstr(s, s1, t0, t1));
2521 s = t + 4) {
e294cc5d
JH
2522#ifdef EBCDIC
2523 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2524 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2525#else
07be1b83
YO
2526 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2527 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2528#endif
07be1b83
YO
2529 *min -= 4;
2530 }
2531 }
2532
2533#ifdef DEBUGGING
2534 /* Allow dumping */
2535 n = scan + NODE_SZ_STR(scan);
2536 while (n <= stop) {
2537 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2538 OP(n) = OPTIMIZED;
2539 NEXT_OFF(n) = 0;
2540 }
2541 n++;
2542 }
2543#endif
2544 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2545 return stopnow;
2546}
2547
486ec47a 2548/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2549 Finds fixed substrings. */
2550
a0288114 2551/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2552 to the position after last scanned or to NULL. */
2553
40d049e4
YO
2554#define INIT_AND_WITHP \
2555 assert(!and_withp); \
2556 Newx(and_withp,1,struct regnode_charclass_class); \
2557 SAVEFREEPV(and_withp)
07be1b83 2558
b515a41d 2559/* this is a chain of data about sub patterns we are processing that
486ec47a 2560 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2561 we can simulate recursion without losing state. */
2562struct scan_frame;
2563typedef struct scan_frame {
2564 regnode *last; /* last node to process in this frame */
2565 regnode *next; /* next node to process when last is reached */
2566 struct scan_frame *prev; /*previous frame*/
2567 I32 stop; /* what stopparen do we use */
2568} scan_frame;
2569
304ee84b
YO
2570
2571#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2572
e1d1eefb
YO
2573#define CASE_SYNST_FNC(nAmE) \
2574case nAmE: \
2575 if (flags & SCF_DO_STCLASS_AND) { \
2576 for (value = 0; value < 256; value++) \
2577 if (!is_ ## nAmE ## _cp(value)) \
2578 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2579 } \
2580 else { \
2581 for (value = 0; value < 256; value++) \
2582 if (is_ ## nAmE ## _cp(value)) \
2583 ANYOF_BITMAP_SET(data->start_class, value); \
2584 } \
2585 break; \
2586case N ## nAmE: \
2587 if (flags & SCF_DO_STCLASS_AND) { \
2588 for (value = 0; value < 256; value++) \
2589 if (is_ ## nAmE ## _cp(value)) \
2590 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2591 } \
2592 else { \
2593 for (value = 0; value < 256; value++) \
2594 if (!is_ ## nAmE ## _cp(value)) \
2595 ANYOF_BITMAP_SET(data->start_class, value); \
2596 } \
2597 break
2598
2599
2600
76e3520e 2601STATIC I32
40d049e4 2602S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2603 I32 *minlenp, I32 *deltap,
40d049e4
YO
2604 regnode *last,
2605 scan_data_t *data,
2606 I32 stopparen,
2607 U8* recursed,
2608 struct regnode_charclass_class *and_withp,
2609 U32 flags, U32 depth)
c277df42
IZ
2610 /* scanp: Start here (read-write). */
2611 /* deltap: Write maxlen-minlen here. */
2612 /* last: Stop before this one. */
40d049e4
YO
2613 /* data: string data about the pattern */
2614 /* stopparen: treat close N as END */
2615 /* recursed: which subroutines have we recursed into */
2616 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2617{
97aff369 2618 dVAR;
c277df42
IZ
2619 I32 min = 0, pars = 0, code;
2620 regnode *scan = *scanp, *next;
2621 I32 delta = 0;
2622 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2623 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2624 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2625 scan_data_t data_fake;
a3621e74 2626 SV *re_trie_maxbuff = NULL;
786e8c11 2627 regnode *first_non_open = scan;
e2e6a0f1 2628 I32 stopmin = I32_MAX;
8aa23a47 2629 scan_frame *frame = NULL;
a3621e74 2630 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2631
7918f24d
NC
2632 PERL_ARGS_ASSERT_STUDY_CHUNK;
2633
13a24bad 2634#ifdef DEBUGGING
40d049e4 2635 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2636#endif
40d049e4 2637
786e8c11 2638 if ( depth == 0 ) {
40d049e4 2639 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2640 first_non_open=regnext(first_non_open);
2641 }
2642
b81d288d 2643
8aa23a47
YO
2644 fake_study_recurse:
2645 while ( scan && OP(scan) != END && scan < last ){
2646 /* Peephole optimizer: */
304ee84b 2647 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2648 DEBUG_PEEP("Peep",scan,depth);
2649 JOIN_EXACT(scan,&min,0);
2650
2651 /* Follow the next-chain of the current node and optimize
2652 away all the NOTHINGs from it. */
2653 if (OP(scan) != CURLYX) {
2654 const int max = (reg_off_by_arg[OP(scan)]
2655 ? I32_MAX
2656 /* I32 may be smaller than U16 on CRAYs! */
2657 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2658 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2659 int noff;
2660 regnode *n = scan;
2661
2662 /* Skip NOTHING and LONGJMP. */
2663 while ((n = regnext(n))
2664 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2665 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2666 && off + noff < max)
2667 off += noff;
2668 if (reg_off_by_arg[OP(scan)])
2669 ARG(scan) = off;
2670 else
2671 NEXT_OFF(scan) = off;
2672 }
a3621e74 2673
c277df42 2674
8aa23a47
YO
2675
2676 /* The principal pseudo-switch. Cannot be a switch, since we
2677 look into several different things. */
2678 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2679 || OP(scan) == IFTHEN) {
2680 next = regnext(scan);
2681 code = OP(scan);
2682 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2683
2684 if (OP(next) == code || code == IFTHEN) {
2685 /* NOTE - There is similar code to this block below for handling
2686 TRIE nodes on a re-study. If you change stuff here check there
2687 too. */
2688 I32 max1 = 0, min1 = I32_MAX, num = 0;
2689 struct regnode_charclass_class accum;
2690 regnode * const startbranch=scan;
2691
2692 if (flags & SCF_DO_SUBSTR)
304ee84b 2693 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2694 if (flags & SCF_DO_STCLASS)
2695 cl_init_zero(pRExC_state, &accum);
2696
2697 while (OP(scan) == code) {
2698 I32 deltanext, minnext, f = 0, fake;
2699 struct regnode_charclass_class this_class;
2700
2701 num++;
2702 data_fake.flags = 0;
2703 if (data) {
2704 data_fake.whilem_c = data->whilem_c;
2705 data_fake.last_closep = data->last_closep;
2706 }
2707 else
2708 data_fake.last_closep = &fake;
58e23c8d
YO
2709
2710 data_fake.pos_delta = delta;
8aa23a47
YO
2711 next = regnext(scan);
2712 scan = NEXTOPER(scan);
2713 if (code != BRANCH)
c277df42 2714 scan = NEXTOPER(scan);
8aa23a47
YO
2715 if (flags & SCF_DO_STCLASS) {
2716 cl_init(pRExC_state, &this_class);
2717 data_fake.start_class = &this_class;
2718 f = SCF_DO_STCLASS_AND;
58e23c8d 2719 }
8aa23a47
YO
2720 if (flags & SCF_WHILEM_VISITED_POS)
2721 f |= SCF_WHILEM_VISITED_POS;
2722
2723 /* we suppose the run is continuous, last=next...*/
2724 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2725 next, &data_fake,
2726 stopparen, recursed, NULL, f,depth+1);
2727 if (min1 > minnext)
2728 min1 = minnext;
2729 if (max1 < minnext + deltanext)
2730 max1 = minnext + deltanext;
2731 if (deltanext == I32_MAX)
2732 is_inf = is_inf_internal = 1;
2733 scan = next;
2734 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2735 pars++;
2736 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2737 if ( stopmin > minnext)
2738 stopmin = min + min1;
2739 flags &= ~SCF_DO_SUBSTR;
2740 if (data)
2741 data->flags |= SCF_SEEN_ACCEPT;
2742 }
2743 if (data) {
2744 if (data_fake.flags & SF_HAS_EVAL)
2745 data->flags |= SF_HAS_EVAL;
2746 data->whilem_c = data_fake.whilem_c;
3dab1dad 2747 }
8aa23a47
YO
2748 if (flags & SCF_DO_STCLASS)
2749 cl_or(pRExC_state, &accum, &this_class);
2750 }
2751 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2752 min1 = 0;
2753 if (flags & SCF_DO_SUBSTR) {
2754 data->pos_min += min1;
2755 data->pos_delta += max1 - min1;
2756 if (max1 != min1 || is_inf)
2757 data->longest = &(data->longest_float);
2758 }
2759 min += min1;
2760 delta += max1 - min1;
2761 if (flags & SCF_DO_STCLASS_OR) {
2762 cl_or(pRExC_state, data->start_class, &accum);
2763 if (min1) {
2764 cl_and(data->start_class, and_withp);
2765 flags &= ~SCF_DO_STCLASS;
653099ff 2766 }
8aa23a47
YO
2767 }
2768 else if (flags & SCF_DO_STCLASS_AND) {
2769 if (min1) {
2770 cl_and(data->start_class, &accum);
2771 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2772 }
8aa23a47
YO
2773 else {
2774 /* Switch to OR mode: cache the old value of
2775 * data->start_class */
2776 INIT_AND_WITHP;
2777 StructCopy(data->start_class, and_withp,
2778 struct regnode_charclass_class);
2779 flags &= ~SCF_DO_STCLASS_AND;
2780 StructCopy(&accum, data->start_class,
2781 struct regnode_charclass_class);
2782 flags |= SCF_DO_STCLASS_OR;
2783 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2784 }
8aa23a47 2785 }
a3621e74 2786
8aa23a47
YO
2787 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2788 /* demq.
a3621e74 2789
8aa23a47
YO
2790 Assuming this was/is a branch we are dealing with: 'scan' now
2791 points at the item that follows the branch sequence, whatever
2792 it is. We now start at the beginning of the sequence and look
2793 for subsequences of
a3621e74 2794
8aa23a47
YO
2795 BRANCH->EXACT=>x1
2796 BRANCH->EXACT=>x2
2797 tail
a3621e74 2798
8aa23a47 2799 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2800
486ec47a 2801 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2802 element into a trie and then add the subsequent branch exact
2803 strings to the trie.
a3621e74 2804
8aa23a47 2805 We have two cases
a3621e74 2806
3b753521 2807 1. patterns where the whole set of branches can be converted.
a3621e74 2808
8aa23a47 2809 2. patterns where only a subset can be converted.
a3621e74 2810
8aa23a47
YO
2811 In case 1 we can replace the whole set with a single regop
2812 for the trie. In case 2 we need to keep the start and end
3b753521 2813 branches so
a3621e74 2814
8aa23a47
YO
2815 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2816 becomes BRANCH TRIE; BRANCH X;
786e8c11 2817
8aa23a47
YO
2818 There is an additional case, that being where there is a
2819 common prefix, which gets split out into an EXACT like node
2820 preceding the TRIE node.
a3621e74 2821
8aa23a47
YO
2822 If x(1..n)==tail then we can do a simple trie, if not we make
2823 a "jump" trie, such that when we match the appropriate word
486ec47a 2824 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2825 a nested if into a case structure of sorts.
b515a41d 2826
8aa23a47
YO
2827 */
2828
2829 int made=0;
2830 if (!re_trie_maxbuff) {
2831 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2832 if (!SvIOK(re_trie_maxbuff))
2833 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2834 }
2835 if ( SvIV(re_trie_maxbuff)>=0 ) {
2836 regnode *cur;
2837 regnode *first = (regnode *)NULL;
2838 regnode *last = (regnode *)NULL;
2839 regnode *tail = scan;
2840 U8 optype = 0;
2841 U32 count=0;
a3621e74
YO
2842
2843#ifdef DEBUGGING
8aa23a47 2844 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2845#endif
8aa23a47
YO
2846 /* var tail is used because there may be a TAIL
2847 regop in the way. Ie, the exacts will point to the
2848 thing following the TAIL, but the last branch will
2849 point at the TAIL. So we advance tail. If we
2850 have nested (?:) we may have to move through several
2851 tails.
2852 */
2853
2854 while ( OP( tail ) == TAIL ) {
2855 /* this is the TAIL generated by (?:) */
2856 tail = regnext( tail );
2857 }
a3621e74 2858
8aa23a47
YO
2859
2860 DEBUG_OPTIMISE_r({
2861 regprop(RExC_rx, mysv, tail );
2862 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2863 (int)depth * 2 + 2, "",
2864 "Looking for TRIE'able sequences. Tail node is: ",
2865 SvPV_nolen_const( mysv )
2866 );
2867 });
2868
2869 /*
2870
2871 step through the branches, cur represents each
2872 branch, noper is the first thing to be matched
2873 as part of that branch and noper_next is the
2874 regnext() of that node. if noper is an EXACT
2875 and noper_next is the same as scan (our current
2876 position in the regex) then the EXACT branch is
2877 a possible optimization target. Once we have
486ec47a 2878 two or more consecutive such branches we can
8aa23a47
YO
2879 create a trie of the EXACT's contents and stich
2880 it in place. If the sequence represents all of
2881 the branches we eliminate the whole thing and
2882 replace it with a single TRIE. If it is a
2883 subsequence then we need to stitch it in. This
2884 means the first branch has to remain, and needs
2885 to be repointed at the item on the branch chain
2886 following the last branch optimized. This could
2887 be either a BRANCH, in which case the
2888 subsequence is internal, or it could be the
2889 item following the branch sequence in which
2890 case the subsequence is at the end.
2891
2892 */
2893
2894 /* dont use tail as the end marker for this traverse */
2895 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2896 regnode * const noper = NEXTOPER( cur );
b515a41d 2897#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2898 regnode * const noper_next = regnext( noper );
b515a41d
YO
2899#endif
2900
8aa23a47
YO
2901 DEBUG_OPTIMISE_r({
2902 regprop(RExC_rx, mysv, cur);
2903 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2904 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2905
2906 regprop(RExC_rx, mysv, noper);
2907 PerlIO_printf( Perl_debug_log, " -> %s",
2908 SvPV_nolen_const(mysv));
2909
2910 if ( noper_next ) {
2911 regprop(RExC_rx, mysv, noper_next );
2912 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2913 SvPV_nolen_const(mysv));
2914 }
2915 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2916 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2917 });
2918 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2919 : PL_regkind[ OP( noper ) ] == EXACT )
2920 || OP(noper) == NOTHING )
786e8c11 2921#ifdef NOJUMPTRIE
8aa23a47 2922 && noper_next == tail
786e8c11 2923#endif
8aa23a47
YO
2924 && count < U16_MAX)
2925 {
2926 count++;
2927 if ( !first || optype == NOTHING ) {
2928 if (!first) first = cur;
2929 optype = OP( noper );
2930 } else {
2931 last = cur;
2932 }
2933 } else {
a0a388a1 2934/*
0abd0d78
YO
2935 Currently we do not believe that the trie logic can
2936 handle case insensitive matching properly when the
2937 pattern is not unicode (thus forcing unicode semantics).
2938
2939 If/when this is fixed the following define can be swapped
2940 in below to fully enable trie logic.
2941
a0a388a1 2942#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2943
2944*/
2945#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2946
a0a388a1 2947 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2948 make_trie( pRExC_state,
2949 startbranch, first, cur, tail, count,
2950 optype, depth+1 );
2951 }
2952 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2953#ifdef NOJUMPTRIE
8aa23a47 2954 && noper_next == tail
786e8c11 2955#endif
8aa23a47
YO
2956 ){
2957 count = 1;
2958 first = cur;
2959 optype = OP( noper );
2960 } else {
2961 count = 0;
2962 first = NULL;
2963 optype = 0;
2964 }
2965 last = NULL;
2966 }
2967 }
2968 DEBUG_OPTIMISE_r({
2969 regprop(RExC_rx, mysv, cur);
2970 PerlIO_printf( Perl_debug_log,
2971 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2972 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2973
2974 });
a0a388a1
YO
2975
2976 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2977 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2978#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2979 if ( ((made == MADE_EXACT_TRIE &&
2980 startbranch == first)
2981 || ( first_non_open == first )) &&
2982 depth==0 ) {
2983 flags |= SCF_TRIE_RESTUDY;
2984 if ( startbranch == first
2985 && scan == tail )
2986 {
2987 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2988 }
2989 }
3dab1dad 2990#endif
8aa23a47
YO
2991 }
2992 }
2993
2994 } /* do trie */
2995
653099ff 2996 }
8aa23a47
YO
2997 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2998 scan = NEXTOPER(NEXTOPER(scan));
2999 } else /* single branch is optimized. */
3000 scan = NEXTOPER(scan);
3001 continue;
3002 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3003 scan_frame *newframe = NULL;
3004 I32 paren;
3005 regnode *start;
3006 regnode *end;
3007
3008 if (OP(scan) != SUSPEND) {
3009 /* set the pointer */
3010 if (OP(scan) == GOSUB) {
3011 paren = ARG(scan);
3012 RExC_recurse[ARG2L(scan)] = scan;
3013 start = RExC_open_parens[paren-1];
3014 end = RExC_close_parens[paren-1];
3015 } else {
3016 paren = 0;
f8fc2ecf 3017 start = RExC_rxi->program + 1;
8aa23a47
YO
3018 end = RExC_opend;
3019 }
3020 if (!recursed) {
3021 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3022 SAVEFREEPV(recursed);
3023 }
3024 if (!PAREN_TEST(recursed,paren+1)) {
3025 PAREN_SET(recursed,paren+1);
3026 Newx(newframe,1,scan_frame);
3027 } else {
3028 if (flags & SCF_DO_SUBSTR) {
304ee84b 3029 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3030 data->longest = &(data->longest_float);
3031 }
3032 is_inf = is_inf_internal = 1;
3033 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3034 cl_anything(pRExC_state, data->start_class);
3035 flags &= ~SCF_DO_STCLASS;
3036 }
3037 } else {
3038 Newx(newframe,1,scan_frame);
3039 paren = stopparen;
3040 start = scan+2;
3041 end = regnext(scan);
3042 }
3043 if (newframe) {
3044 assert(start);
3045 assert(end);
3046 SAVEFREEPV(newframe);
3047 newframe->next = regnext(scan);
3048 newframe->last = last;
3049 newframe->stop = stopparen;
3050 newframe->prev = frame;
3051
3052 frame = newframe;
3053 scan = start;
3054 stopparen = paren;
3055 last = end;
3056
3057 continue;
3058 }
3059 }
3060 else if (OP(scan) == EXACT) {
3061 I32 l = STR_LEN(scan);
3062 UV uc;
3063 if (UTF) {
3064 const U8 * const s = (U8*)STRING(scan);
3065 l = utf8_length(s, s + l);
3066 uc = utf8_to_uvchr(s, NULL);
3067 } else {
3068 uc = *((U8*)STRING(scan));
3069 }
3070 min += l;
3071 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3072 /* The code below prefers earlier match for fixed
3073 offset, later match for variable offset. */
3074 if (data->last_end == -1) { /* Update the start info. */
3075 data->last_start_min = data->pos_min;
3076 data->last_start_max = is_inf
3077 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3078 }
8aa23a47
YO
3079 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3080 if (UTF)
3081 SvUTF8_on(data->last_found);
3082 {
3083 SV * const sv = data->last_found;
3084 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3085 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3086 if (mg && mg->mg_len >= 0)
3087 mg->mg_len += utf8_length((U8*)STRING(scan),
3088 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3089 }
8aa23a47
YO
3090 data->last_end = data->pos_min + l;
3091 data->pos_min += l; /* As in the first entry. */
3092 data->flags &= ~SF_BEFORE_EOL;
3093 }
3094 if (flags & SCF_DO_STCLASS_AND) {
3095 /* Check whether it is compatible with what we know already! */
3096 int compat = 1;
3097
54251c2e 3098
486ec47a 3099 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3100 * in the bitmp and either 1) its bit or its fold is set, or 2)
3101 * it's for a locale. Even if there isn't unicode semantics
3102 * here, at runtime there may be because of matching against a
3103 * utf8 string, so accept a possible false positive for
3104 * latin1-range folds */
8aa23a47
YO
3105 if (uc >= 0x100 ||
3106 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3107 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3108 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3109 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47 3110 )
d18bf9dc 3111 {
8aa23a47 3112 compat = 0;
d18bf9dc 3113 }
8aa23a47
YO
3114 ANYOF_CLASS_ZERO(data->start_class);
3115 ANYOF_BITMAP_ZERO(data->start_class);
3116 if (compat)
3117 ANYOF_BITMAP_SET(data->start_class, uc);
d18bf9dc
KW
3118 else if (uc >= 0x100) {
3119 int i;
3120
3121 /* Some Unicode code points fold to the Latin1 range; as
3122 * XXX temporary code, instead of figuring out if this is
3123 * one, just assume it is and set all the start class bits
3124 * that could be some such above 255 code point's fold
3125 * which will generate fals positives. As the code
3126 * elsewhere that does compute the fold settles down, it
3127 * can be extracted out and re-used here */
3128 for (i = 0; i < 256; i++){
3129 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3130 ANYOF_BITMAP_SET(data->start_class, i);
3131 }
3132 }
3133 }
8aa23a47
YO
3134 data->start_class->flags &= ~ANYOF_EOS;
3135 if (uc < 0x100)
3136 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3137 }
3138 else if (flags & SCF_DO_STCLASS_OR) {
3139 /* false positive possible if the class is case-folded */
3140 if (uc < 0x100)
3141 ANYOF_BITMAP_SET(data->start_class, uc);
3142 else
3143 data->start_class->flags |= ANYOF_UNICODE_ALL;
3144 data->start_class->flags &= ~ANYOF_EOS;
3145 cl_and(data->start_class, and_withp);
3146 }
3147 flags &= ~SCF_DO_STCLASS;
3148 }
3149 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3150 I32 l = STR_LEN(scan);
3151 UV uc = *((U8*)STRING(scan));
3152
3153 /* Search for fixed substrings supports EXACT only. */
3154 if (flags & SCF_DO_SUBSTR) {
3155 assert(data);
304ee84b 3156 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3157 }
3158 if (UTF) {
3159 const U8 * const s = (U8 *)STRING(scan);
3160 l = utf8_length(s, s + l);
3161 uc = utf8_to_uvchr(s, NULL);
3162 }
3163 min += l;
3164 if (flags & SCF_DO_SUBSTR)
3165 data->pos_min += l;
3166 if (flags & SCF_DO_STCLASS_AND) {
3167 /* Check whether it is compatible with what we know already! */
3168 int compat = 1;
8aa23a47 3169 if (uc >= 0x100 ||
54251c2e
KW
3170 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3171 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3172 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3173 {
8aa23a47 3174 compat = 0;
54251c2e 3175 }
8aa23a47
YO
3176 ANYOF_CLASS_ZERO(data->start_class);
3177 ANYOF_BITMAP_ZERO(data->start_class);
3178 if (compat) {
3179 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3180 data->start_class->flags &= ~ANYOF_EOS;
39065660 3181 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3182 if (OP(scan) == EXACTFL) {
8aa23a47 3183 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3184 }
3185 else {
3186
54251c2e
KW
3187 /* Also set the other member of the fold pair. In case
3188 * that unicode semantics is called for at runtime, use
3189 * the full latin1 fold. (Can't do this for locale,
3190 * because not known until runtime */
3191 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3192 }
653099ff 3193 }
d18bf9dc
KW
3194 else if (uc >= 0x100) {
3195 int i;
3196 for (i = 0; i < 256; i++){
3197 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3198 ANYOF_BITMAP_SET(data->start_class, i);
3199 }
3200 }
3201 }
8aa23a47
YO
3202 }
3203 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3204 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3205 /* false positive possible if the class is case-folded.
3206 Assume that the locale settings are the same... */
970c8436 3207 if (uc < 0x100) {
1aa99e6b 3208 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3209 if (OP(scan) != EXACTFL) {
3210
3211 /* And set the other member of the fold pair, but
3212 * can't do that in locale because not known until
3213 * run-time */
3214 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3215 PL_fold_latin1[uc]);
970c8436
KW
3216 }
3217 }
653099ff
GS
3218 data->start_class->flags &= ~ANYOF_EOS;
3219 }
8aa23a47 3220 cl_and(data->start_class, and_withp);
653099ff 3221 }
8aa23a47
YO
3222 flags &= ~SCF_DO_STCLASS;
3223 }
e52fc539 3224 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3225 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3226 I32 f = flags, pos_before = 0;
3227 regnode * const oscan = scan;
3228 struct regnode_charclass_class this_class;
3229 struct regnode_charclass_class *oclass = NULL;
3230 I32 next_is_eval = 0;
3231
3232 switch (PL_regkind[OP(scan)]) {
3233 case WHILEM: /* End of (?:...)* . */
3234 scan = NEXTOPER(scan);
3235 goto finish;
3236 case PLUS:
3237 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3238 next = NEXTOPER(scan);
3239 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3240 mincount = 1;
3241 maxcount = REG_INFTY;
3242 next = regnext(scan);
3243 scan = NEXTOPER(scan);
3244 goto do_curly;
3245 }
3246 }
3247 if (flags & SCF_DO_SUBSTR)
3248 data->pos_min++;
3249 min++;
3250 /* Fall through. */
3251 case STAR:
3252 if (flags & SCF_DO_STCLASS) {
3253 mincount = 0;
3254 maxcount = REG_INFTY;
3255 next = regnext(scan);
3256 scan = NEXTOPER(scan);
3257 goto do_curly;
3258 }
3259 is_inf = is_inf_internal = 1;
3260 scan = regnext(scan);
c277df42 3261 if (flags & SCF_DO_SUBSTR) {
304ee84b 3262 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3263 data->longest = &(data->longest_float);
c277df42 3264 }
8aa23a47
YO
3265 goto optimize_curly_tail;
3266 case CURLY:
3267 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3268 && (scan->flags == stopparen))
3269 {
3270 mincount = 1;
3271 maxcount = 1;
3272 } else {
3273 mincount = ARG1(scan);
3274 maxcount = ARG2(scan);
653099ff 3275 }
8aa23a47
YO
3276 next = regnext(scan);
3277 if (OP(scan) == CURLYX) {
3278 I32 lp = (data ? *(data->last_closep) : 0);
3279 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3280 }
8aa23a47
YO
3281 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3282 next_is_eval = (OP(scan) == EVAL);
3283 do_curly:
3284 if (flags & SCF_DO_SUBSTR) {
304ee84b 3285 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3286 pos_before = data->pos_min;
b45f050a 3287 }
8aa23a47
YO
3288 if (data) {
3289 fl = data->flags;
3290 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3291 if (is_inf)
3292 data->flags |= SF_IS_INF;
3293 }
3294 if (flags & SCF_DO_STCLASS) {
3295 cl_init(pRExC_state, &this_class);
3296 oclass = data->start_class;
3297 data->start_class = &this_class;
3298 f |= SCF_DO_STCLASS_AND;
3299 f &= ~SCF_DO_STCLASS_OR;
3300 }
779bcb7d
NC
3301 /* Exclude from super-linear cache processing any {n,m}
3302 regops for which the combination of input pos and regex
3303 pos is not enough information to determine if a match
3304 will be possible.
3305
3306 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3307 regex pos at the \s*, the prospects for a match depend not
3308 only on the input position but also on how many (bar\s*)
3309 repeats into the {4,8} we are. */
3310 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3311 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3312
8aa23a47
YO
3313 /* This will finish on WHILEM, setting scan, or on NULL: */
3314 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3315 last, data, stopparen, recursed, NULL,
3316 (mincount == 0
3317 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3318
8aa23a47
YO
3319 if (flags & SCF_DO_STCLASS)
3320 data->start_class = oclass;
3321 if (mincount == 0 || minnext == 0) {
3322 if (flags & SCF_DO_STCLASS_OR) {
3323 cl_or(pRExC_state, data->start_class, &this_class);
3324 }
3325 else if (flags & SCF_DO_STCLASS_AND) {
3326 /* Switch to OR mode: cache the old value of
3327 * data->start_class */
3328 INIT_AND_WITHP;
3329 StructCopy(data->start_class, and_withp,
3330 struct regnode_charclass_class);
3331 flags &= ~SCF_DO_STCLASS_AND;
3332 StructCopy(&this_class, data->start_class,
3333 struct regnode_charclass_class);
3334 flags |= SCF_DO_STCLASS_OR;
3335 data->start_class->flags |= ANYOF_EOS;
3336 }
3337 } else { /* Non-zero len */
3338 if (flags & SCF_DO_STCLASS_OR) {
3339 cl_or(pRExC_state, data->start_class, &this_class);
3340 cl_and(data->start_class, and_withp);
3341 }
3342 else if (flags & SCF_DO_STCLASS_AND)
3343 cl_and(data->start_class, &this_class);
3344 flags &= ~SCF_DO_STCLASS;
3345 }
3346 if (!scan) /* It was not CURLYX, but CURLY. */
3347 scan = next;
3348 if ( /* ? quantifier ok, except for (?{ ... }) */
3349 (next_is_eval || !(mincount == 0 && maxcount == 1))
3350 && (minnext == 0) && (deltanext == 0)
3351 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3352 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3353 {
668c081a
NC
3354 ckWARNreg(RExC_parse,
3355 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3356 }
3357
3358 min += minnext * mincount;
3359 is_inf_internal |= ((maxcount == REG_INFTY
3360 && (minnext + deltanext) > 0)
3361 || deltanext == I32_MAX);
3362 is_inf |= is_inf_internal;
3363 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3364
3365 /* Try powerful optimization CURLYX => CURLYN. */
3366 if ( OP(oscan) == CURLYX && data
3367 && data->flags & SF_IN_PAR
3368 && !(data->flags & SF_HAS_EVAL)
3369 && !deltanext && minnext == 1 ) {
3370 /* Try to optimize to CURLYN. */
3371 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3372 regnode * const nxt1 = nxt;
497b47a8 3373#ifdef DEBUGGING
8aa23a47 3374 regnode *nxt2;
497b47a8 3375#endif
c277df42 3376
8aa23a47
YO
3377 /* Skip open. */
3378 nxt = regnext(nxt);
e52fc539 3379 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3380 && !(PL_regkind[OP(nxt)] == EXACT
3381 && STR_LEN(nxt) == 1))
3382 goto nogo;
497b47a8 3383#ifdef DEBUGGING
8aa23a47 3384 nxt2 = nxt;
497b47a8 3385#endif
8aa23a47
YO
3386 nxt = regnext(nxt);
3387 if (OP(nxt) != CLOSE)
3388 goto nogo;
3389 if (RExC_open_parens) {
3390 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3391 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3392 }
3393 /* Now we know that nxt2 is the only contents: */
3394 oscan->flags = (U8)ARG(nxt);
3395 OP(oscan) = CURLYN;
3396 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3397
c277df42 3398#ifdef DEBUGGING
8aa23a47 3399 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3400 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3401 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3402 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3403 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3404 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3405#endif
8aa23a47
YO
3406 }
3407 nogo:
3408
3409 /* Try optimization CURLYX => CURLYM. */
3410 if ( OP(oscan) == CURLYX && data
3411 && !(data->flags & SF_HAS_PAR)
3412 && !(data->flags & SF_HAS_EVAL)
3413 && !deltanext /* atom is fixed width */
3414 && minnext != 0 /* CURLYM can't handle zero width */
3415 ) {
3416 /* XXXX How to optimize if data == 0? */
3417 /* Optimize to a simpler form. */
3418 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3419 regnode *nxt2;
3420
3421 OP(oscan) = CURLYM;
3422 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3423 && (OP(nxt2) != WHILEM))
3424 nxt = nxt2;
3425 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3426 /* Need to optimize away parenths. */
b3c0965f 3427 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3428 /* Set the parenth number. */
3429 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3430
8aa23a47
YO
3431 oscan->flags = (U8)ARG(nxt);
3432 if (RExC_open_parens) {
3433 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3434 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3435 }
8aa23a47
YO
3436 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3437 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3438
c277df42 3439#ifdef DEBUGGING
8aa23a47
YO
3440 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3441 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3442 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3443 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3444#endif
c277df42 3445#if 0
8aa23a47
YO
3446 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3447 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3448 if (nnxt == nxt) {
3449 if (reg_off_by_arg[OP(nxt1)])
3450 ARG_SET(nxt1, nxt2 - nxt1);
3451 else if (nxt2 - nxt1 < U16_MAX)
3452 NEXT_OFF(nxt1) = nxt2 - nxt1;
3453 else
3454 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3455 }
8aa23a47 3456 nxt1 = nnxt;
c277df42 3457 }
5d1c421c 3458#endif
8aa23a47
YO
3459 /* Optimize again: */
3460 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3461 NULL, stopparen, recursed, NULL, 0,depth+1);
3462 }
3463 else
3464 oscan->flags = 0;
3465 }
3466 else if ((OP(oscan) == CURLYX)
3467 && (flags & SCF_WHILEM_VISITED_POS)
3468 /* See the comment on a similar expression above.
3b753521 3469 However, this time it's not a subexpression
8aa23a47
YO
3470 we care about, but the expression itself. */
3471 && (maxcount == REG_INFTY)
3472 && data && ++data->whilem_c < 16) {
3473 /* This stays as CURLYX, we can put the count/of pair. */
3474 /* Find WHILEM (as in regexec.c) */
3475 regnode *nxt = oscan + NEXT_OFF(oscan);
3476
3477 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3478 nxt += ARG(nxt);
3479 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3480 | (RExC_whilem_seen << 4)); /* On WHILEM */
3481 }
3482 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3483 pars++;
3484 if (flags & SCF_DO_SUBSTR) {
3485 SV *last_str = NULL;
3486 int counted = mincount != 0;
a0ed51b3 3487
8aa23a47
YO
3488 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3489#if defined(SPARC64_GCC_WORKAROUND)
3490 I32 b = 0;
3491 STRLEN l = 0;
3492 const char *s = NULL;
3493 I32 old = 0;
b515a41d 3494
8aa23a47
YO
3495 if (pos_before >= data->last_start_min)
3496 b = pos_before;
3497 else
3498 b = data->last_start_min;
b515a41d 3499
8aa23a47
YO
3500 l = 0;
3501 s = SvPV_const(data->last_found, l);
3502 old = b - data->last_start_min;
3503
3504#else
3505 I32 b = pos_before >= data->last_start_min
3506 ? pos_before : data->last_start_min;
3507 STRLEN l;
3508 const char * const s = SvPV_const(data->last_found, l);
3509 I32 old = b - data->last_start_min;
3510#endif
3511
3512 if (UTF)
3513 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3514 l -= old;
3515 /* Get the added string: */
740cce10 3516 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3517 if (deltanext == 0 && pos_before == b) {
3518 /* What was added is a constant string */
3519 if (mincount > 1) {
3520 SvGROW(last_str, (mincount * l) + 1);
3521 repeatcpy(SvPVX(last_str) + l,
3522 SvPVX_const(last_str), l, mincount - 1);
3523 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3524 /* Add additional parts. */
3525 SvCUR_set(data->last_found,
3526 SvCUR(data->last_found) - l);
3527 sv_catsv(data->last_found, last_str);
3528 {
3529 SV * sv = data->last_found;
3530 MAGIC *mg =
3531 SvUTF8(sv) && SvMAGICAL(sv) ?
3532 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3533 if (mg && mg->mg_len >= 0)
bd94e887 3534 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3535 }
8aa23a47 3536 data->last_end += l * (mincount - 1);
b515a41d 3537 }
8aa23a47
YO
3538 } else {
3539 /* start offset must point into the last copy */
3540 data->last_start_min += minnext * (mincount - 1);
3541 data->last_start_max += is_inf ? I32_MAX
3542 : (maxcount - 1) * (minnext + data->pos_delta);
3543 }
c277df42 3544 }
8aa23a47
YO
3545 /* It is counted once already... */
3546 data->pos_min += minnext * (mincount - counted);
3547 data->pos_delta += - counted * deltanext +
3548 (minnext + deltanext) * maxcount - minnext * mincount;
3549 if (mincount != maxcount) {
3550 /* Cannot extend fixed substrings found inside
3551 the group. */
304ee84b 3552 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3553 if (mincount && last_str) {
3554 SV * const sv = data->last_found;
3555 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3556 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3557
3558 if (mg)
3559 mg->mg_len = -1;
3560 sv_setsv(sv, last_str);
3561 data->last_end = data->pos_min;
3562 data->last_start_min =
3563 data->pos_min - CHR_SVLEN(last_str);
3564 data->last_start_max = is_inf
3565 ? I32_MAX
3566 : data->pos_min + data->pos_delta
3567 - CHR_SVLEN(last_str);
3568 }
3569 data->longest = &(data->longest_float);
3570 }
3571 SvREFCNT_dec(last_str);
c277df42 3572 }
8aa23a47
YO
3573 if (data && (fl & SF_HAS_EVAL))
3574 data->flags |= SF_HAS_EVAL;
3575 optimize_curly_tail:
3576 if (OP(oscan) != CURLYX) {
3577 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3578 && NEXT_OFF(next))
3579 NEXT_OFF(oscan) += NEXT_OFF(next);
3580 }
3581 continue;
f56b6394 3582 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 3583 if (flags & SCF_DO_SUBSTR) {
304ee84b 3584 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3585 data->longest = &(data->longest_float);
3586 }
3587 is_inf = is_inf_internal = 1;
3588 if (flags & SCF_DO_STCLASS_OR)
3589 cl_anything(pRExC_state, data->start_class);
3590 flags &= ~SCF_DO_STCLASS;
3591 break;
c277df42 3592 }
8aa23a47 3593 }
e1d1eefb
YO
3594 else if (OP(scan) == LNBREAK) {
3595 if (flags & SCF_DO_STCLASS) {
3596 int value = 0;
3597 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3598 if (flags & SCF_DO_STCLASS_AND) {
3599 for (value = 0; value < 256; value++)
e64b1bd1 3600 if (!is_VERTWS_cp(value))
b9a59e08
KW
3601 ANYOF_BITMAP_CLEAR(data->start_class, value);
3602 }
3603 else {
e1d1eefb 3604 for (value = 0; value < 256; value++)
e64b1bd1 3605 if (is_VERTWS_cp(value))
b9a59e08
KW
3606 ANYOF_BITMAP_SET(data->start_class, value);
3607 }
e1d1eefb
YO
3608 if (flags & SCF_DO_STCLASS_OR)
3609 cl_and(data->start_class, and_withp);
3610 flags &= ~SCF_DO_STCLASS;
3611 }
3612 min += 1;
f9a79580 3613 delta += 1;
e1d1eefb
YO
3614 if (flags & SCF_DO_SUBSTR) {
3615 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3616 data->pos_min += 1;
f9a79580 3617 data->pos_delta += 1;
e1d1eefb
YO
3618 data->longest = &(data->longest_float);
3619 }
e1d1eefb 3620 }
f9a79580 3621 else if (OP(scan) == FOLDCHAR) {
ced7f090 3622 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
f9a79580
RGS
3623 flags &= ~SCF_DO_STCLASS;
3624 min += 1;
3625 delta += d;
3626 if (flags & SCF_DO_SUBSTR) {
3627 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3628 data->pos_min += 1;
3629 data->pos_delta += d;
3630 data->longest = &(data->longest_float);
3631 }
3632 }
e52fc539 3633 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3634 int value = 0;
653099ff 3635
8aa23a47 3636 if (flags & SCF_DO_SUBSTR) {
304ee84b 3637 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3638 data->pos_min++;
3639 }
3640 min++;
3641 if (flags & SCF_DO_STCLASS) {
3642 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3643
8aa23a47
YO
3644 /* Some of the logic below assumes that switching
3645 locale on will only add false positives. */
3646 switch (PL_regkind[OP(scan)]) {
3647 case SANY:
3648 default:
3649 do_default:
3650 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3651 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3652 cl_anything(pRExC_state, data->start_class);
3653 break;
3654 case REG_ANY:
3655 if (OP(scan) == SANY)
3656 goto do_default;
3657 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3658 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 3659 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
8aa23a47 3660 cl_anything(pRExC_state, data->start_class);
653099ff 3661 }
8aa23a47
YO
3662 if (flags & SCF_DO_STCLASS_AND || !value)
3663 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3664 break;
3665 case ANYOF:
3666 if (flags & SCF_DO_STCLASS_AND)
3667 cl_and(data->start_class,
3668 (struct regnode_charclass_class*)scan);
653099ff 3669 else
8aa23a47
YO
3670 cl_or(pRExC_state, data->start_class,
3671 (struct regnode_charclass_class*)scan);
3672 break;
3673 case ALNUM:
3674 if (flags & SCF_DO_STCLASS_AND) {
3675 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3676 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
980866de 3677 if (OP(scan) == ALNUMU) {
a12cf05f
KW
3678 for (value = 0; value < 256; value++) {
3679 if (!isWORDCHAR_L1(value)) {
3680 ANYOF_BITMAP_CLEAR(data->start_class, value);
3681 }
3682 }
3683