This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Move statement down.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
e40e74fe
KW
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
81714fb9 140 HV *paren_names; /* Paren names */
1f1031fe 141
40d049e4
YO
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
b57e4118 144 I32 in_lookbehind;
4624b182 145 I32 contains_locale;
830247a4
IZ
146#if ADD_TO_REGEXEC
147 char *starttry; /* -Dr: where regtry was called. */
148#define RExC_starttry (pRExC_state->starttry)
149#endif
3dab1dad 150#ifdef DEBUGGING
be8e71aa 151 const char *lastparse;
3dab1dad 152 I32 lastnum;
1f1031fe 153 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
154#define RExC_lastparse (pRExC_state->lastparse)
155#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 156#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 157#endif
830247a4
IZ
158} RExC_state_t;
159
e2509266 160#define RExC_flags (pRExC_state->flags)
830247a4 161#define RExC_precomp (pRExC_state->precomp)
288b8c02 162#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 163#define RExC_rx (pRExC_state->rx)
f8fc2ecf 164#define RExC_rxi (pRExC_state->rxi)
fac92740 165#define RExC_start (pRExC_state->start)
830247a4
IZ
166#define RExC_end (pRExC_state->end)
167#define RExC_parse (pRExC_state->parse)
168#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
169#ifdef RE_TRACK_PATTERN_OFFSETS
170#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
171#endif
830247a4 172#define RExC_emit (pRExC_state->emit)
fac92740 173#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 174#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
175#define RExC_naughty (pRExC_state->naughty)
176#define RExC_sawback (pRExC_state->sawback)
177#define RExC_seen (pRExC_state->seen)
178#define RExC_size (pRExC_state->size)
179#define RExC_npar (pRExC_state->npar)
e2e6a0f1 180#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
181#define RExC_extralen (pRExC_state->extralen)
182#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
183#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 184#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 185#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 186#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
187#define RExC_open_parens (pRExC_state->open_parens)
188#define RExC_close_parens (pRExC_state->close_parens)
189#define RExC_opend (pRExC_state->opend)
81714fb9 190#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
191#define RExC_recurse (pRExC_state->recurse)
192#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 193#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 194#define RExC_contains_locale (pRExC_state->contains_locale)
830247a4 195
cde0cee5 196
a687059c
LW
197#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
198#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
199 ((*s) == '{' && regcurly(s)))
a687059c 200
35c8bce7
LW
201#ifdef SPSTART
202#undef SPSTART /* dratted cpp namespace... */
203#endif
a687059c
LW
204/*
205 * Flags to be passed up and down.
206 */
a687059c 207#define WORST 0 /* Worst case. */
a3b492c3 208#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
209
210/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
d7b56a3c 211 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
fda99bee 212#define SIMPLE 0x02
a3b492c3
YO
213#define SPSTART 0x04 /* Starts with * or +. */
214#define TRYAGAIN 0x08 /* Weeded out a declaration. */
215#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 216
3dab1dad
YO
217#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
218
07be1b83
YO
219/* whether trie related optimizations are enabled */
220#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
221#define TRIE_STUDY_OPT
786e8c11 222#define FULL_TRIE_STUDY
07be1b83
YO
223#define TRIE_STCLASS
224#endif
1de06328
YO
225
226
40d049e4
YO
227
228#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
229#define PBITVAL(paren) (1 << ((paren) & 7))
230#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
231#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
232#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
233
bbd61b5f
KW
234/* If not already in utf8, do a longjmp back to the beginning */
235#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
236#define REQUIRE_UTF8 STMT_START { \
237 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
238 } STMT_END
40d049e4 239
1de06328
YO
240/* About scan_data_t.
241
242 During optimisation we recurse through the regexp program performing
243 various inplace (keyhole style) optimisations. In addition study_chunk
244 and scan_commit populate this data structure with information about
245 what strings MUST appear in the pattern. We look for the longest
3b753521 246 string that must appear at a fixed location, and we look for the
1de06328
YO
247 longest string that may appear at a floating location. So for instance
248 in the pattern:
249
250 /FOO[xX]A.*B[xX]BAR/
251
252 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
253 strings (because they follow a .* construct). study_chunk will identify
254 both FOO and BAR as being the longest fixed and floating strings respectively.
255
256 The strings can be composites, for instance
257
258 /(f)(o)(o)/
259
260 will result in a composite fixed substring 'foo'.
261
262 For each string some basic information is maintained:
263
264 - offset or min_offset
265 This is the position the string must appear at, or not before.
266 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
267 characters must match before the string we are searching for.
268 Likewise when combined with minlenp and the length of the string it
1de06328
YO
269 tells us how many characters must appear after the string we have
270 found.
271
272 - max_offset
273 Only used for floating strings. This is the rightmost point that
3b753521 274 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
275 string can occur infinitely far to the right.
276
277 - minlenp
278 A pointer to the minimum length of the pattern that the string
279 was found inside. This is important as in the case of positive
280 lookahead or positive lookbehind we can have multiple patterns
281 involved. Consider
282
283 /(?=FOO).*F/
284
285 The minimum length of the pattern overall is 3, the minimum length
286 of the lookahead part is 3, but the minimum length of the part that
287 will actually match is 1. So 'FOO's minimum length is 3, but the
288 minimum length for the F is 1. This is important as the minimum length
289 is used to determine offsets in front of and behind the string being
290 looked for. Since strings can be composites this is the length of the
486ec47a 291 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
292 the length is calculated by study_chunk, so that the minimum lengths
293 are not known until the full pattern has been compiled, thus the
294 pointer to the value.
295
296 - lookbehind
297
298 In the case of lookbehind the string being searched for can be
299 offset past the start point of the final matching string.
300 If this value was just blithely removed from the min_offset it would
301 invalidate some of the calculations for how many chars must match
302 before or after (as they are derived from min_offset and minlen and
303 the length of the string being searched for).
304 When the final pattern is compiled and the data is moved from the
305 scan_data_t structure into the regexp structure the information
306 about lookbehind is factored in, with the information that would
307 have been lost precalculated in the end_shift field for the
308 associated string.
309
310 The fields pos_min and pos_delta are used to store the minimum offset
311 and the delta to the maximum offset at the current point in the pattern.
312
313*/
2c2d71f5
JH
314
315typedef struct scan_data_t {
1de06328
YO
316 /*I32 len_min; unused */
317 /*I32 len_delta; unused */
2c2d71f5
JH
318 I32 pos_min;
319 I32 pos_delta;
320 SV *last_found;
1de06328 321 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
322 I32 last_start_min;
323 I32 last_start_max;
1de06328
YO
324 SV **longest; /* Either &l_fixed, or &l_float. */
325 SV *longest_fixed; /* longest fixed string found in pattern */
326 I32 offset_fixed; /* offset where it starts */
486ec47a 327 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
328 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
329 SV *longest_float; /* longest floating string found in pattern */
330 I32 offset_float_min; /* earliest point in string it can appear */
331 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 332 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 333 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
334 I32 flags;
335 I32 whilem_c;
cb434fcc 336 I32 *last_closep;
653099ff 337 struct regnode_charclass_class *start_class;
2c2d71f5
JH
338} scan_data_t;
339
a687059c 340/*
e50aee73 341 * Forward declarations for pregcomp()'s friends.
a687059c 342 */
a0d0e21e 343
27da23d5 344static const scan_data_t zero_scan_data =
1de06328 345 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
346
347#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
348#define SF_BEFORE_SEOL 0x0001
349#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
350#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
351#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
352
09b7f37c
CB
353#ifdef NO_UNARY_PLUS
354# define SF_FIX_SHIFT_EOL (0+2)
355# define SF_FL_SHIFT_EOL (0+4)
356#else
357# define SF_FIX_SHIFT_EOL (+2)
358# define SF_FL_SHIFT_EOL (+4)
359#endif
c277df42
IZ
360
361#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
362#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
363
364#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
365#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
366#define SF_IS_INF 0x0040
367#define SF_HAS_PAR 0x0080
368#define SF_IN_PAR 0x0100
369#define SF_HAS_EVAL 0x0200
370#define SCF_DO_SUBSTR 0x0400
653099ff
GS
371#define SCF_DO_STCLASS_AND 0x0800
372#define SCF_DO_STCLASS_OR 0x1000
373#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 374#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 375
786e8c11 376#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 377#define SCF_SEEN_ACCEPT 0x8000
07be1b83 378
43fead97 379#define UTF cBOOL(RExC_utf8)
a62b1201
KW
380#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
381#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
382#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
383#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
384#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1
KW
385#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
386#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 387
43fead97 388#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 389
ffc61ed2 390#define OOB_UNICODE 12345678
93733859 391#define OOB_NAMEDCLASS -1
b8c5462f 392
a0ed51b3
LW
393#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
394#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
395
8615cb43 396
b45f050a
JF
397/* length of regex to show in messages that don't mark a position within */
398#define RegexLengthToShowInErrorMessages 127
399
400/*
401 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
402 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
403 * op/pragma/warn/regcomp.
404 */
7253e4e3
RK
405#define MARKER1 "<-- HERE" /* marker as it appears in the description */
406#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 407
7253e4e3 408#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
409
410/*
411 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
412 * arg. Show regex, up to a maximum length. If it's too long, chop and add
413 * "...".
414 */
58e23c8d 415#define _FAIL(code) STMT_START { \
bfed75c6 416 const char *ellipses = ""; \
ccb2c380
MP
417 IV len = RExC_end - RExC_precomp; \
418 \
419 if (!SIZE_ONLY) \
288b8c02 420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
421 if (len > RegexLengthToShowInErrorMessages) { \
422 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
423 len = RegexLengthToShowInErrorMessages - 10; \
424 ellipses = "..."; \
425 } \
58e23c8d 426 code; \
ccb2c380 427} STMT_END
8615cb43 428
58e23c8d
YO
429#define FAIL(msg) _FAIL( \
430 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
431 msg, (int)len, RExC_precomp, ellipses))
432
433#define FAIL2(msg,arg) _FAIL( \
434 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
435 arg, (int)len, RExC_precomp, ellipses))
436
b45f050a 437/*
b45f050a
JF
438 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
439 */
ccb2c380 440#define Simple_vFAIL(m) STMT_START { \
a28509cc 441 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
442 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
443 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
444} STMT_END
b45f050a
JF
445
446/*
447 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
448 */
ccb2c380
MP
449#define vFAIL(m) STMT_START { \
450 if (!SIZE_ONLY) \
288b8c02 451 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
452 Simple_vFAIL(m); \
453} STMT_END
b45f050a
JF
454
455/*
456 * Like Simple_vFAIL(), but accepts two arguments.
457 */
ccb2c380 458#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 459 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
460 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
461 (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
b45f050a
JF
463
464/*
465 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
466 */
ccb2c380
MP
467#define vFAIL2(m,a1) STMT_START { \
468 if (!SIZE_ONLY) \
288b8c02 469 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
470 Simple_vFAIL2(m, a1); \
471} STMT_END
b45f050a
JF
472
473
474/*
475 * Like Simple_vFAIL(), but accepts three arguments.
476 */
ccb2c380 477#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 478 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
479 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
480 (int)offset, RExC_precomp, RExC_precomp + offset); \
481} STMT_END
b45f050a
JF
482
483/*
484 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
485 */
ccb2c380
MP
486#define vFAIL3(m,a1,a2) STMT_START { \
487 if (!SIZE_ONLY) \
288b8c02 488 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
489 Simple_vFAIL3(m, a1, a2); \
490} STMT_END
b45f050a
JF
491
492/*
493 * Like Simple_vFAIL(), but accepts four arguments.
494 */
ccb2c380 495#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 496 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
497 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
498 (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
b45f050a 500
668c081a 501#define ckWARNreg(loc,m) STMT_START { \
a28509cc 502 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
505} STMT_END
506
668c081a 507#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 508 const IV offset = loc - RExC_precomp; \
d1d15184 509 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
510 m REPORT_LOCATION, \
511 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
512} STMT_END
513
2335b3d3
KW
514#define ckWARN2regdep(loc,m, a1) STMT_START { \
515 const IV offset = loc - RExC_precomp; \
516 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
517 m REPORT_LOCATION, \
518 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
519} STMT_END
520
668c081a 521#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 522 const IV offset = loc - RExC_precomp; \
668c081a 523 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
524 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
525} STMT_END
526
527#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 528 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
529 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
530 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
531} STMT_END
532
668c081a
NC
533#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
534 const IV offset = loc - RExC_precomp; \
535 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
536 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
537} STMT_END
538
ccb2c380 539#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 540 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
541 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
542 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
543} STMT_END
544
668c081a
NC
545#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
546 const IV offset = loc - RExC_precomp; \
547 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
548 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
549} STMT_END
550
ccb2c380 551#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 552 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
553 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
554 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
555} STMT_END
9d1d55b5 556
8615cb43 557
cd439c50 558/* Allow for side effects in s */
ccb2c380
MP
559#define REGC(c,s) STMT_START { \
560 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
561} STMT_END
cd439c50 562
fac92740
MJD
563/* Macros for recording node offsets. 20001227 mjd@plover.com
564 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
565 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
566 * Element 0 holds the number n.
07be1b83 567 * Position is 1 indexed.
fac92740 568 */
7122b237
YO
569#ifndef RE_TRACK_PATTERN_OFFSETS
570#define Set_Node_Offset_To_R(node,byte)
571#define Set_Node_Offset(node,byte)
572#define Set_Cur_Node_Offset
573#define Set_Node_Length_To_R(node,len)
574#define Set_Node_Length(node,len)
575#define Set_Node_Cur_Length(node)
576#define Node_Offset(n)
577#define Node_Length(n)
578#define Set_Node_Offset_Length(node,offset,len)
579#define ProgLen(ri) ri->u.proglen
580#define SetProgLen(ri,x) ri->u.proglen = x
581#else
582#define ProgLen(ri) ri->u.offsets[0]
583#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
584#define Set_Node_Offset_To_R(node,byte) STMT_START { \
585 if (! SIZE_ONLY) { \
586 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 587 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 588 if((node) < 0) { \
551405c4 589 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
590 } else { \
591 RExC_offsets[2*(node)-1] = (byte); \
592 } \
593 } \
594} STMT_END
595
596#define Set_Node_Offset(node,byte) \
597 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
598#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
599
600#define Set_Node_Length_To_R(node,len) STMT_START { \
601 if (! SIZE_ONLY) { \
602 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 603 __LINE__, (int)(node), (int)(len))); \
ccb2c380 604 if((node) < 0) { \
551405c4 605 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
606 } else { \
607 RExC_offsets[2*(node)] = (len); \
608 } \
609 } \
610} STMT_END
611
612#define Set_Node_Length(node,len) \
613 Set_Node_Length_To_R((node)-RExC_emit_start, len)
614#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
615#define Set_Node_Cur_Length(node) \
616 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
617
618/* Get offsets and lengths */
619#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
620#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
621
07be1b83
YO
622#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
623 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
624 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
625} STMT_END
7122b237 626#endif
07be1b83
YO
627
628#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
629#define EXPERIMENTAL_INPLACESCAN
f427392e 630#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 631
304ee84b
YO
632#define DEBUG_STUDYDATA(str,data,depth) \
633DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 634 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
635 "%*s" str "Pos:%"IVdf"/%"IVdf \
636 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
637 (int)(depth)*2, "", \
638 (IV)((data)->pos_min), \
639 (IV)((data)->pos_delta), \
304ee84b 640 (UV)((data)->flags), \
1de06328 641 (IV)((data)->whilem_c), \
304ee84b
YO
642 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
643 is_inf ? "INF " : "" \
1de06328
YO
644 ); \
645 if ((data)->last_found) \
646 PerlIO_printf(Perl_debug_log, \
647 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
648 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
649 SvPVX_const((data)->last_found), \
650 (IV)((data)->last_end), \
651 (IV)((data)->last_start_min), \
652 (IV)((data)->last_start_max), \
653 ((data)->longest && \
654 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
655 SvPVX_const((data)->longest_fixed), \
656 (IV)((data)->offset_fixed), \
657 ((data)->longest && \
658 (data)->longest==&((data)->longest_float)) ? "*" : "", \
659 SvPVX_const((data)->longest_float), \
660 (IV)((data)->offset_float_min), \
661 (IV)((data)->offset_float_max) \
662 ); \
663 PerlIO_printf(Perl_debug_log,"\n"); \
664});
665
acfe0abc 666static void clear_re(pTHX_ void *r);
4327152a 667
653099ff 668/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 669 Update the longest found anchored substring and the longest found
653099ff
GS
670 floating substrings if needed. */
671
4327152a 672STATIC void
304ee84b 673S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 674{
e1ec3a88
AL
675 const STRLEN l = CHR_SVLEN(data->last_found);
676 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 677 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 678
7918f24d
NC
679 PERL_ARGS_ASSERT_SCAN_COMMIT;
680
c277df42 681 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 682 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
683 if (*data->longest == data->longest_fixed) {
684 data->offset_fixed = l ? data->last_start_min : data->pos_min;
685 if (data->flags & SF_BEFORE_EOL)
b81d288d 686 data->flags
c277df42
IZ
687 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
688 else
689 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
690 data->minlen_fixed=minlenp;
691 data->lookbehind_fixed=0;
a0ed51b3 692 }
304ee84b 693 else { /* *data->longest == data->longest_float */
c277df42 694 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
695 data->offset_float_max = (l
696 ? data->last_start_max
c277df42 697 : data->pos_min + data->pos_delta);
304ee84b 698 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 699 data->offset_float_max = I32_MAX;
c277df42 700 if (data->flags & SF_BEFORE_EOL)
b81d288d 701 data->flags
c277df42
IZ
702 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
703 else
704 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
705 data->minlen_float=minlenp;
706 data->lookbehind_float=0;
c277df42
IZ
707 }
708 }
709 SvCUR_set(data->last_found, 0);
0eda9292 710 {
a28509cc 711 SV * const sv = data->last_found;
097eb12c
AL
712 if (SvUTF8(sv) && SvMAGICAL(sv)) {
713 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
714 if (mg)
715 mg->mg_len = 0;
716 }
0eda9292 717 }
c277df42
IZ
718 data->last_end = -1;
719 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 720 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
721}
722
653099ff
GS
723/* Can match anything (initialization) */
724STATIC void
3fffb88a 725S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 726{
7918f24d
NC
727 PERL_ARGS_ASSERT_CL_ANYTHING;
728
f8bef550 729 ANYOF_BITMAP_SETALL(cl);
6b1ea9dd 730 ANYOF_CLASS_ZERO(cl); /* all bits set, so class is irrelevant */
dd58aee1
KW
731 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
732 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
3fffb88a
KW
733
734 /* If any portion of the regex is to operate under locale rules,
735 * initialization includes it. The reason this isn't done for all regexes
736 * is that the optimizer was written under the assumption that locale was
737 * all-or-nothing. Given the complexity and lack of documentation in the
738 * optimizer, and that there are inadequate test cases for locale, so many
739 * parts of it may not work properly, it is safest to avoid locale unless
740 * necessary. */
741 if (RExC_contains_locale) {
742 cl->flags |= ANYOF_LOCALE;
743 }
653099ff
GS
744}
745
746/* Can match anything (initialization) */
747STATIC int
5f66b61c 748S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
749{
750 int value;
751
7918f24d
NC
752 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
753
aaa51d5e 754 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
755 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
756 return 1;
1aa99e6b
IH
757 if (!(cl->flags & ANYOF_UNICODE_ALL))
758 return 0;
10edeb5d 759 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 760 return 0;
653099ff
GS
761 return 1;
762}
763
764/* Can match anything (initialization) */
765STATIC void
e755fd73 766S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 767{
7918f24d
NC
768 PERL_ARGS_ASSERT_CL_INIT;
769
8ecf7187 770 Zero(cl, 1, struct regnode_charclass_class);
653099ff 771 cl->type = ANYOF;
3fffb88a 772 cl_anything(pRExC_state, cl);
1411dba4 773 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
653099ff
GS
774}
775
1051e1c4
KW
776/* These two functions currently do the exact same thing */
777#define cl_init_zero S_cl_init
653099ff 778
dd58aee1
KW
779/* 'AND' a given class with another one. Can create false positives. 'cl'
780 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
781 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 782STATIC void
5f66b61c 783S_cl_and(struct regnode_charclass_class *cl,
a28509cc 784 const struct regnode_charclass_class *and_with)
653099ff 785{
7918f24d 786 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
787
788 assert(and_with->type == ANYOF);
1e6ade67 789
c6b76537 790 /* I (khw) am not sure all these restrictions are necessary XXX */
1e6ade67
KW
791 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
792 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 793 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
794 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
795 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
796 int i;
797
798 if (and_with->flags & ANYOF_INVERT)
799 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
800 cl->bitmap[i] &= ~and_with->bitmap[i];
801 else
802 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
803 cl->bitmap[i] &= and_with->bitmap[i];
804 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
1aa99e6b 805
c6b76537 806 if (and_with->flags & ANYOF_INVERT) {
8951c461 807
c6b76537
KW
808 /* Here, the and'ed node is inverted. Get the AND of the flags that
809 * aren't affected by the inversion. Those that are affected are
810 * handled individually below */
811 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
812 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
813 cl->flags |= affected_flags;
814
815 /* We currently don't know how to deal with things that aren't in the
816 * bitmap, but we know that the intersection is no greater than what
817 * is already in cl, so let there be false positives that get sorted
818 * out after the synthetic start class succeeds, and the node is
819 * matched for real. */
820
821 /* The inversion of these two flags indicate that the resulting
822 * intersection doesn't have them */
823 if (and_with->flags & ANYOF_UNICODE_ALL) {
4713bfe1
KW
824 cl->flags &= ~ANYOF_UNICODE_ALL;
825 }
c6b76537
KW
826 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
827 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
137165a6 828 }
1aa99e6b 829 }
c6b76537 830 else { /* and'd node is not inverted */
137165a6 831 if (! ANYOF_NONBITMAP(and_with)) {
c6b76537
KW
832
833 /* Here 'and_with' doesn't match anything outside the bitmap
834 * (except possibly ANYOF_UNICODE_ALL), which means the
835 * intersection can't either, except for ANYOF_UNICODE_ALL, in
836 * which case we don't know what the intersection is, but it's no
837 * greater than what cl already has, so can just leave it alone,
838 * with possible false positives */
839 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
840 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
871d0d1a 841 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
c6b76537 842 }
137165a6 843 }
c6b76537
KW
844 else if (! ANYOF_NONBITMAP(cl)) {
845
846 /* Here, 'and_with' does match something outside the bitmap, and cl
847 * doesn't have a list of things to match outside the bitmap. If
848 * cl can match all code points above 255, the intersection will
849 * be those above-255 code points that 'and_with' matches. There
850 * may be false positives from code points in 'and_with' that are
851 * outside the bitmap but below 256, but those get sorted out
852 * after the synthetic start class succeeds). If cl can't match
853 * all Unicode code points, it means here that it can't match *
854 * anything outside the bitmap, so we leave the bitmap empty */
855 if (cl->flags & ANYOF_UNICODE_ALL) {
856 ARG_SET(cl, ARG(and_with));
857 }
858 }
859 else {
860 /* Here, both 'and_with' and cl match something outside the
861 * bitmap. Currently we do not do the intersection, so just match
862 * whatever cl had at the beginning. */
863 }
864
865
866 /* Take the intersection of the two sets of flags */
867 cl->flags &= and_with->flags;
137165a6 868 }
653099ff
GS
869}
870
dd58aee1
KW
871/* 'OR' a given class with another one. Can create false positives. 'cl'
872 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
873 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 874STATIC void
3fffb88a 875S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 876{
7918f24d
NC
877 PERL_ARGS_ASSERT_CL_OR;
878
653099ff 879 if (or_with->flags & ANYOF_INVERT) {
c6b76537
KW
880
881 /* Here, the or'd node is to be inverted. This means we take the
882 * complement of everything not in the bitmap, but currently we don't
883 * know what that is, so give up and match anything */
884 if (ANYOF_NONBITMAP(or_with)) {
3fffb88a 885 cl_anything(pRExC_state, cl);
c6b76537 886 }
653099ff
GS
887 /* We do not use
888 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
889 * <= (B1 | !B2) | (CL1 | !CL2)
890 * which is wasteful if CL2 is small, but we ignore CL2:
891 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
892 * XXXX Can we handle case-fold? Unclear:
893 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
894 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
895 */
c6b76537 896 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
897 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
898 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
899 int i;
900
901 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
902 cl->bitmap[i] |= ~or_with->bitmap[i];
903 } /* XXXX: logic is complicated otherwise */
904 else {
3fffb88a 905 cl_anything(pRExC_state, cl);
653099ff 906 }
c6b76537
KW
907
908 /* And, we can just take the union of the flags that aren't affected
909 * by the inversion */
910 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
911
912 /* For the remaining flags:
913 ANYOF_UNICODE_ALL and inverted means to not match anything above
914 255, which means that the union with cl should just be
915 what cl has in it, so can ignore this flag
916 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
917 is 127-255 to match them, but then invert that, so the
918 union with cl should just be what cl has in it, so can
919 ignore this flag
920 */
921 } else { /* 'or_with' is not inverted */
653099ff
GS
922 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
923 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
924 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
925 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
926 int i;
927
928 /* OR char bitmap and class bitmap separately */
929 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
930 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 931 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
932 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
933 cl->classflags[i] |= or_with->classflags[i];
934 cl->flags |= ANYOF_CLASS;
935 }
936 }
937 else { /* XXXX: logic is complicated, leave it along for a moment. */
3fffb88a 938 cl_anything(pRExC_state, cl);
653099ff 939 }
9826f543 940
c6b76537
KW
941 if (ANYOF_NONBITMAP(or_with)) {
942
943 /* Use the added node's outside-the-bit-map match if there isn't a
944 * conflict. If there is a conflict (both nodes match something
945 * outside the bitmap, but what they match outside is not the same
946 * pointer, and hence not easily compared until XXX we extend
947 * inversion lists this far), give up and allow the start class to
948 * match everything outside the bitmap */
949 if (! ANYOF_NONBITMAP(cl)) {
950 ARG_SET(cl, ARG(or_with));
951 }
952 else if (ARG(cl) != ARG(or_with)) {
953 cl->flags |= ANYOF_UNICODE_ALL;
954 }
0b9668ee
KW
955
956 /* Take the union */
957 cl->flags |= or_with->flags;
c6b76537 958 }
1aa99e6b 959 }
653099ff
GS
960}
961
a3621e74
YO
962#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
963#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
964#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
965#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
966
3dab1dad
YO
967
968#ifdef DEBUGGING
07be1b83 969/*
2b8b4781
NC
970 dump_trie(trie,widecharmap,revcharmap)
971 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
972 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
973
974 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
975 The _interim_ variants are used for debugging the interim
976 tables that are used to generate the final compressed
977 representation which is what dump_trie expects.
978
486ec47a 979 Part of the reason for their existence is to provide a form
3dab1dad 980 of documentation as to how the different representations function.
07be1b83
YO
981
982*/
3dab1dad
YO
983
984/*
3dab1dad
YO
985 Dumps the final compressed table form of the trie to Perl_debug_log.
986 Used for debugging make_trie().
987*/
b9a59e08 988
3dab1dad 989STATIC void
2b8b4781
NC
990S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
991 AV *revcharmap, U32 depth)
3dab1dad
YO
992{
993 U32 state;
ab3bbdeb 994 SV *sv=sv_newmortal();
55eed653 995 int colwidth= widecharmap ? 6 : 4;
2e64971a 996 U16 word;
3dab1dad
YO
997 GET_RE_DEBUG_FLAGS_DECL;
998
7918f24d 999 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1000
3dab1dad
YO
1001 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1002 (int)depth * 2 + 2,"",
1003 "Match","Base","Ofs" );
1004
1005 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1006 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1007 if ( tmp ) {
ab3bbdeb
YO
1008 PerlIO_printf( Perl_debug_log, "%*s",
1009 colwidth,
ddc5bc0f 1010 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1011 PL_colors[0], PL_colors[1],
1012 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1013 PERL_PV_ESCAPE_FIRSTCHAR
1014 )
1015 );
3dab1dad
YO
1016 }
1017 }
1018 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1019 (int)depth * 2 + 2,"");
1020
1021 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1022 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1023 PerlIO_printf( Perl_debug_log, "\n");
1024
1e2e3d02 1025 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1026 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1027
1028 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1029
1030 if ( trie->states[ state ].wordnum ) {
1031 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1032 } else {
1033 PerlIO_printf( Perl_debug_log, "%6s", "" );
1034 }
1035
1036 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1037
1038 if ( base ) {
1039 U32 ofs = 0;
1040
1041 while( ( base + ofs < trie->uniquecharcount ) ||
1042 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1043 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1044 ofs++;
1045
1046 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1047
1048 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1049 if ( ( base + ofs >= trie->uniquecharcount ) &&
1050 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1051 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1052 {
ab3bbdeb
YO
1053 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1054 colwidth,
3dab1dad
YO
1055 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1056 } else {
ab3bbdeb 1057 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1058 }
1059 }
1060
1061 PerlIO_printf( Perl_debug_log, "]");
1062
1063 }
1064 PerlIO_printf( Perl_debug_log, "\n" );
1065 }
2e64971a
DM
1066 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1067 for (word=1; word <= trie->wordcount; word++) {
1068 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1069 (int)word, (int)(trie->wordinfo[word].prev),
1070 (int)(trie->wordinfo[word].len));
1071 }
1072 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1073}
1074/*
3dab1dad
YO
1075 Dumps a fully constructed but uncompressed trie in list form.
1076 List tries normally only are used for construction when the number of
1077 possible chars (trie->uniquecharcount) is very high.
1078 Used for debugging make_trie().
1079*/
1080STATIC void
55eed653 1081S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1082 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1083 U32 depth)
3dab1dad
YO
1084{
1085 U32 state;
ab3bbdeb 1086 SV *sv=sv_newmortal();
55eed653 1087 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1088 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1089
1090 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1091
3dab1dad 1092 /* print out the table precompression. */
ab3bbdeb
YO
1093 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1094 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1095 "------:-----+-----------------\n" );
3dab1dad
YO
1096
1097 for( state=1 ; state < next_alloc ; state ++ ) {
1098 U16 charid;
1099
ab3bbdeb 1100 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1101 (int)depth * 2 + 2,"", (UV)state );
1102 if ( ! trie->states[ state ].wordnum ) {
1103 PerlIO_printf( Perl_debug_log, "%5s| ","");
1104 } else {
1105 PerlIO_printf( Perl_debug_log, "W%4x| ",
1106 trie->states[ state ].wordnum
1107 );
1108 }
1109 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1110 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1111 if ( tmp ) {
1112 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1113 colwidth,
ddc5bc0f 1114 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1115 PL_colors[0], PL_colors[1],
1116 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1117 PERL_PV_ESCAPE_FIRSTCHAR
1118 ) ,
1e2e3d02
YO
1119 TRIE_LIST_ITEM(state,charid).forid,
1120 (UV)TRIE_LIST_ITEM(state,charid).newstate
1121 );
1122 if (!(charid % 10))
664e119d
RGS
1123 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1124 (int)((depth * 2) + 14), "");
1e2e3d02 1125 }
ab3bbdeb
YO
1126 }
1127 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1128 }
1129}
1130
1131/*
3dab1dad
YO
1132 Dumps a fully constructed but uncompressed trie in table form.
1133 This is the normal DFA style state transition table, with a few
1134 twists to facilitate compression later.
1135 Used for debugging make_trie().
1136*/
1137STATIC void
55eed653 1138S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1139 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1140 U32 depth)
3dab1dad
YO
1141{
1142 U32 state;
1143 U16 charid;
ab3bbdeb 1144 SV *sv=sv_newmortal();
55eed653 1145 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1146 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1147
1148 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1149
1150 /*
1151 print out the table precompression so that we can do a visual check
1152 that they are identical.
1153 */
1154
1155 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1156
1157 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1158 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1159 if ( tmp ) {
ab3bbdeb
YO
1160 PerlIO_printf( Perl_debug_log, "%*s",
1161 colwidth,
ddc5bc0f 1162 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1163 PL_colors[0], PL_colors[1],
1164 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1165 PERL_PV_ESCAPE_FIRSTCHAR
1166 )
1167 );
3dab1dad
YO
1168 }
1169 }
1170
1171 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1172
1173 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1174 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1175 }
1176
1177 PerlIO_printf( Perl_debug_log, "\n" );
1178
1179 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1180
1181 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1182 (int)depth * 2 + 2,"",
1183 (UV)TRIE_NODENUM( state ) );
1184
1185 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1186 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1187 if (v)
1188 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1189 else
1190 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1191 }
1192 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1193 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1194 } else {
1195 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1196 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1197 }
1198 }
07be1b83 1199}
3dab1dad
YO
1200
1201#endif
1202
2e64971a 1203
786e8c11
YO
1204/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1205 startbranch: the first branch in the whole branch sequence
1206 first : start branch of sequence of branch-exact nodes.
1207 May be the same as startbranch
1208 last : Thing following the last branch.
1209 May be the same as tail.
1210 tail : item following the branch sequence
1211 count : words in the sequence
1212 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1213 depth : indent depth
3dab1dad 1214
786e8c11 1215Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1216
786e8c11
YO
1217A trie is an N'ary tree where the branches are determined by digital
1218decomposition of the key. IE, at the root node you look up the 1st character and
1219follow that branch repeat until you find the end of the branches. Nodes can be
1220marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1221
786e8c11 1222 /he|she|his|hers/
72f13be8 1223
786e8c11
YO
1224would convert into the following structure. Numbers represent states, letters
1225following numbers represent valid transitions on the letter from that state, if
1226the number is in square brackets it represents an accepting state, otherwise it
1227will be in parenthesis.
07be1b83 1228
786e8c11
YO
1229 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1230 | |
1231 | (2)
1232 | |
1233 (1) +-i->(6)-+-s->[7]
1234 |
1235 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1236
786e8c11
YO
1237 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1238
1239This shows that when matching against the string 'hers' we will begin at state 1
1240read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1241then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1242is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1243single traverse. We store a mapping from accepting to state to which word was
1244matched, and then when we have multiple possibilities we try to complete the
1245rest of the regex in the order in which they occured in the alternation.
1246
1247The only prior NFA like behaviour that would be changed by the TRIE support is
1248the silent ignoring of duplicate alternations which are of the form:
1249
1250 / (DUPE|DUPE) X? (?{ ... }) Y /x
1251
4b714af6 1252Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1253and without the optimisation. With the optimisations dupes will be silently
486ec47a 1254ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1255the following demonstrates:
1256
1257 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1258
1259which prints out 'word' three times, but
1260
1261 'words'=~/(word|word|word)(?{ print $1 })S/
1262
1263which doesnt print it out at all. This is due to other optimisations kicking in.
1264
1265Example of what happens on a structural level:
1266
486ec47a 1267The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1268
1269 1: CURLYM[1] {1,32767}(18)
1270 5: BRANCH(8)
1271 6: EXACT <ac>(16)
1272 8: BRANCH(11)
1273 9: EXACT <ad>(16)
1274 11: BRANCH(14)
1275 12: EXACT <ab>(16)
1276 16: SUCCEED(0)
1277 17: NOTHING(18)
1278 18: END(0)
1279
1280This would be optimizable with startbranch=5, first=5, last=16, tail=16
1281and should turn into:
1282
1283 1: CURLYM[1] {1,32767}(18)
1284 5: TRIE(16)
1285 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1286 <ac>
1287 <ad>
1288 <ab>
1289 16: SUCCEED(0)
1290 17: NOTHING(18)
1291 18: END(0)
1292
1293Cases where tail != last would be like /(?foo|bar)baz/:
1294
1295 1: BRANCH(4)
1296 2: EXACT <foo>(8)
1297 4: BRANCH(7)
1298 5: EXACT <bar>(8)
1299 7: TAIL(8)
1300 8: EXACT <baz>(10)
1301 10: END(0)
1302
1303which would be optimizable with startbranch=1, first=1, last=7, tail=8
1304and would end up looking like:
1305
1306 1: TRIE(8)
1307 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1308 <foo>
1309 <bar>
1310 7: TAIL(8)
1311 8: EXACT <baz>(10)
1312 10: END(0)
1313
1314 d = uvuni_to_utf8_flags(d, uv, 0);
1315
1316is the recommended Unicode-aware way of saying
1317
1318 *(d++) = uv;
1319*/
1320
1e2e3d02 1321#define TRIE_STORE_REVCHAR \
786e8c11 1322 STMT_START { \
73031816
NC
1323 if (UTF) { \
1324 SV *zlopp = newSV(2); \
88c9ea1e
CB
1325 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1326 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1327 SvCUR_set(zlopp, kapow - flrbbbbb); \
1328 SvPOK_on(zlopp); \
1329 SvUTF8_on(zlopp); \
1330 av_push(revcharmap, zlopp); \
1331 } else { \
6bdeddd2 1332 char ooooff = (char)uvc; \
73031816
NC
1333 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1334 } \
1335 } STMT_END
786e8c11
YO
1336
1337#define TRIE_READ_CHAR STMT_START { \
1338 wordlen++; \
1339 if ( UTF ) { \
1340 if ( folder ) { \
1341 if ( foldlen > 0 ) { \
1342 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1343 foldlen -= len; \
1344 scan += len; \
1345 len = 0; \
1346 } else { \
1347 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1348 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1349 foldlen -= UNISKIP( uvc ); \
1350 scan = foldbuf + UNISKIP( uvc ); \
1351 } \
1352 } else { \
1353 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1354 } \
1355 } else { \
1356 uvc = (U32)*uc; \
1357 len = 1; \
1358 } \
1359} STMT_END
1360
1361
1362
1363#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1364 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1365 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1366 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1367 } \
1368 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1369 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1370 TRIE_LIST_CUR( state )++; \
1371} STMT_END
07be1b83 1372
786e8c11
YO
1373#define TRIE_LIST_NEW(state) STMT_START { \
1374 Newxz( trie->states[ state ].trans.list, \
1375 4, reg_trie_trans_le ); \
1376 TRIE_LIST_CUR( state ) = 1; \
1377 TRIE_LIST_LEN( state ) = 4; \
1378} STMT_END
07be1b83 1379
786e8c11
YO
1380#define TRIE_HANDLE_WORD(state) STMT_START { \
1381 U16 dupe= trie->states[ state ].wordnum; \
1382 regnode * const noper_next = regnext( noper ); \
1383 \
786e8c11
YO
1384 DEBUG_r({ \
1385 /* store the word for dumping */ \
1386 SV* tmp; \
1387 if (OP(noper) != NOTHING) \
740cce10 1388 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1389 else \
740cce10 1390 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1391 av_push( trie_words, tmp ); \
786e8c11
YO
1392 }); \
1393 \
1394 curword++; \
2e64971a
DM
1395 trie->wordinfo[curword].prev = 0; \
1396 trie->wordinfo[curword].len = wordlen; \
1397 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1398 \
1399 if ( noper_next < tail ) { \
1400 if (!trie->jump) \
c944940b 1401 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1402 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1403 if (!jumper) \
1404 jumper = noper_next; \
1405 if (!nextbranch) \
1406 nextbranch= regnext(cur); \
1407 } \
1408 \
1409 if ( dupe ) { \
2e64971a
DM
1410 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1411 /* chain, so that when the bits of chain are later */\
1412 /* linked together, the dups appear in the chain */\
1413 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1414 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1415 } else { \
1416 /* we haven't inserted this word yet. */ \
1417 trie->states[ state ].wordnum = curword; \
1418 } \
1419} STMT_END
07be1b83 1420
3dab1dad 1421
786e8c11
YO
1422#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1423 ( ( base + charid >= ucharcount \
1424 && base + charid < ubound \
1425 && state == trie->trans[ base - ucharcount + charid ].check \
1426 && trie->trans[ base - ucharcount + charid ].next ) \
1427 ? trie->trans[ base - ucharcount + charid ].next \
1428 : ( state==1 ? special : 0 ) \
1429 )
3dab1dad 1430
786e8c11
YO
1431#define MADE_TRIE 1
1432#define MADE_JUMP_TRIE 2
1433#define MADE_EXACT_TRIE 4
3dab1dad 1434
a3621e74 1435STATIC I32
786e8c11 1436S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1437{
27da23d5 1438 dVAR;
a3621e74
YO
1439 /* first pass, loop through and scan words */
1440 reg_trie_data *trie;
55eed653 1441 HV *widecharmap = NULL;
2b8b4781 1442 AV *revcharmap = newAV();
a3621e74 1443 regnode *cur;
9f7f3913 1444 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1445 STRLEN len = 0;
1446 UV uvc = 0;
1447 U16 curword = 0;
1448 U32 next_alloc = 0;
786e8c11
YO
1449 regnode *jumper = NULL;
1450 regnode *nextbranch = NULL;
7f69552c 1451 regnode *convert = NULL;
2e64971a 1452 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1453 /* we just use folder as a flag in utf8 */
1e696034 1454 const U8 * folder = NULL;
a3621e74 1455
2b8b4781
NC
1456#ifdef DEBUGGING
1457 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1458 AV *trie_words = NULL;
1459 /* along with revcharmap, this only used during construction but both are
1460 * useful during debugging so we store them in the struct when debugging.
8e11feef 1461 */
2b8b4781
NC
1462#else
1463 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1464 STRLEN trie_charcount=0;
3dab1dad 1465#endif
2b8b4781 1466 SV *re_trie_maxbuff;
a3621e74 1467 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1468
1469 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1470#ifndef DEBUGGING
1471 PERL_UNUSED_ARG(depth);
1472#endif
a3621e74 1473
1e696034 1474 switch (flags) {
2f7f8cb1 1475 case EXACTFA:
1e696034
KW
1476 case EXACTFU: folder = PL_fold_latin1; break;
1477 case EXACTF: folder = PL_fold; break;
1478 case EXACTFL: folder = PL_fold_locale; break;
1479 }
1480
c944940b 1481 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1482 trie->refcount = 1;
3dab1dad 1483 trie->startstate = 1;
786e8c11 1484 trie->wordcount = word_count;
f8fc2ecf 1485 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1486 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1487 if (!(UTF && folder))
c944940b 1488 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1489 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1490 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1491
a3621e74 1492 DEBUG_r({
2b8b4781 1493 trie_words = newAV();
a3621e74 1494 });
a3621e74 1495
0111c4fd 1496 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1497 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1498 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1499 }
3dab1dad
YO
1500 DEBUG_OPTIMISE_r({
1501 PerlIO_printf( Perl_debug_log,
786e8c11 1502 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1503 (int)depth * 2 + 2, "",
1504 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1505 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1506 (int)depth);
3dab1dad 1507 });
7f69552c
YO
1508
1509 /* Find the node we are going to overwrite */
1510 if ( first == startbranch && OP( last ) != BRANCH ) {
1511 /* whole branch chain */
1512 convert = first;
1513 } else {
1514 /* branch sub-chain */
1515 convert = NEXTOPER( first );
1516 }
1517
a3621e74
YO
1518 /* -- First loop and Setup --
1519
1520 We first traverse the branches and scan each word to determine if it
1521 contains widechars, and how many unique chars there are, this is
1522 important as we have to build a table with at least as many columns as we
1523 have unique chars.
1524
1525 We use an array of integers to represent the character codes 0..255
38a44b82 1526 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1527 native representation of the character value as the key and IV's for the
1528 coded index.
1529
1530 *TODO* If we keep track of how many times each character is used we can
1531 remap the columns so that the table compression later on is more
3b753521 1532 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1533 middle and the least common are on the outside. IMO this would be better
1534 than a most to least common mapping as theres a decent chance the most
1535 common letter will share a node with the least common, meaning the node
486ec47a 1536 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1537 case is when we have the least common nodes twice.
1538
1539 */
1540
a3621e74 1541 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1542 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1543 const U8 *uc = (U8*)STRING( noper );
a28509cc 1544 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1545 STRLEN foldlen = 0;
1546 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1547 const U8 *scan = (U8*)NULL;
07be1b83 1548 U32 wordlen = 0; /* required init */
02daf0ab
YO
1549 STRLEN chars = 0;
1550 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1551
3dab1dad
YO
1552 if (OP(noper) == NOTHING) {
1553 trie->minlen= 0;
1554 continue;
1555 }
02daf0ab
YO
1556 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1557 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1558 regardless of encoding */
1559
a3621e74 1560 for ( ; uc < e ; uc += len ) {
3dab1dad 1561 TRIE_CHARCOUNT(trie)++;
a3621e74 1562 TRIE_READ_CHAR;
3dab1dad 1563 chars++;
a3621e74
YO
1564 if ( uvc < 256 ) {
1565 if ( !trie->charmap[ uvc ] ) {
1566 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1567 if ( folder )
1568 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1569 TRIE_STORE_REVCHAR;
a3621e74 1570 }
02daf0ab 1571 if ( set_bit ) {
62012aee
KW
1572 /* store the codepoint in the bitmap, and its folded
1573 * equivalent. */
02daf0ab 1574 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1575
1576 /* store the folded codepoint */
1577 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1578
1579 if ( !UTF ) {
1580 /* store first byte of utf8 representation of
acdf4139
KW
1581 variant codepoints */
1582 if (! UNI_IS_INVARIANT(uvc)) {
1583 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1584 }
1585 }
02daf0ab
YO
1586 set_bit = 0; /* We've done our bit :-) */
1587 }
a3621e74
YO
1588 } else {
1589 SV** svpp;
55eed653
NC
1590 if ( !widecharmap )
1591 widecharmap = newHV();
a3621e74 1592
55eed653 1593 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1594
1595 if ( !svpp )
e4584336 1596 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1597
1598 if ( !SvTRUE( *svpp ) ) {
1599 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1600 TRIE_STORE_REVCHAR;
a3621e74
YO
1601 }
1602 }
1603 }
3dab1dad
YO
1604 if( cur == first ) {
1605 trie->minlen=chars;
1606 trie->maxlen=chars;
1607 } else if (chars < trie->minlen) {
1608 trie->minlen=chars;
1609 } else if (chars > trie->maxlen) {
1610 trie->maxlen=chars;
1611 }
1612
a3621e74
YO
1613 } /* end first pass */
1614 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1615 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1616 (int)depth * 2 + 2,"",
55eed653 1617 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1618 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1619 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1620 );
a3621e74
YO
1621
1622 /*
1623 We now know what we are dealing with in terms of unique chars and
1624 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1625 representation using a flat table will take. If it's over a reasonable
1626 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1627 conservative but potentially much slower representation using an array
1628 of lists.
1629
1630 At the end we convert both representations into the same compressed
1631 form that will be used in regexec.c for matching with. The latter
1632 is a form that cannot be used to construct with but has memory
1633 properties similar to the list form and access properties similar
1634 to the table form making it both suitable for fast searches and
1635 small enough that its feasable to store for the duration of a program.
1636
1637 See the comment in the code where the compressed table is produced
1638 inplace from the flat tabe representation for an explanation of how
1639 the compression works.
1640
1641 */
1642
1643
2e64971a
DM
1644 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1645 prev_states[1] = 0;
1646
3dab1dad 1647 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1648 /*
1649 Second Pass -- Array Of Lists Representation
1650
1651 Each state will be represented by a list of charid:state records
1652 (reg_trie_trans_le) the first such element holds the CUR and LEN
1653 points of the allocated array. (See defines above).
1654
1655 We build the initial structure using the lists, and then convert
1656 it into the compressed table form which allows faster lookups
1657 (but cant be modified once converted).
a3621e74
YO
1658 */
1659
a3621e74
YO
1660 STRLEN transcount = 1;
1661
1e2e3d02
YO
1662 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1663 "%*sCompiling trie using list compiler\n",
1664 (int)depth * 2 + 2, ""));
446bd890 1665
c944940b
JH
1666 trie->states = (reg_trie_state *)
1667 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1668 sizeof(reg_trie_state) );
a3621e74
YO
1669 TRIE_LIST_NEW(1);
1670 next_alloc = 2;
1671
1672 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1673
c445ea15
AL
1674 regnode * const noper = NEXTOPER( cur );
1675 U8 *uc = (U8*)STRING( noper );
1676 const U8 * const e = uc + STR_LEN( noper );
1677 U32 state = 1; /* required init */
1678 U16 charid = 0; /* sanity init */
1679 U8 *scan = (U8*)NULL; /* sanity init */
1680 STRLEN foldlen = 0; /* required init */
07be1b83 1681 U32 wordlen = 0; /* required init */
c445ea15
AL
1682 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1683
3dab1dad 1684 if (OP(noper) != NOTHING) {
786e8c11 1685 for ( ; uc < e ; uc += len ) {
c445ea15 1686
786e8c11 1687 TRIE_READ_CHAR;
c445ea15 1688
786e8c11
YO
1689 if ( uvc < 256 ) {
1690 charid = trie->charmap[ uvc ];
c445ea15 1691 } else {
55eed653 1692 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1693 if ( !svpp ) {
1694 charid = 0;
1695 } else {
1696 charid=(U16)SvIV( *svpp );
1697 }
c445ea15 1698 }
786e8c11
YO
1699 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1700 if ( charid ) {
a3621e74 1701
786e8c11
YO
1702 U16 check;
1703 U32 newstate = 0;
a3621e74 1704
786e8c11
YO
1705 charid--;
1706 if ( !trie->states[ state ].trans.list ) {
1707 TRIE_LIST_NEW( state );
c445ea15 1708 }
786e8c11
YO
1709 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1710 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1711 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1712 break;
1713 }
1714 }
1715 if ( ! newstate ) {
1716 newstate = next_alloc++;
2e64971a 1717 prev_states[newstate] = state;
786e8c11
YO
1718 TRIE_LIST_PUSH( state, charid, newstate );
1719 transcount++;
1720 }
1721 state = newstate;
1722 } else {
1723 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1724 }
a28509cc 1725 }
c445ea15 1726 }
3dab1dad 1727 TRIE_HANDLE_WORD(state);
a3621e74
YO
1728
1729 } /* end second pass */
1730
1e2e3d02
YO
1731 /* next alloc is the NEXT state to be allocated */
1732 trie->statecount = next_alloc;
c944940b
JH
1733 trie->states = (reg_trie_state *)
1734 PerlMemShared_realloc( trie->states,
1735 next_alloc
1736 * sizeof(reg_trie_state) );
a3621e74 1737
3dab1dad 1738 /* and now dump it out before we compress it */
2b8b4781
NC
1739 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1740 revcharmap, next_alloc,
1741 depth+1)
1e2e3d02 1742 );
a3621e74 1743
c944940b
JH
1744 trie->trans = (reg_trie_trans *)
1745 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1746 {
1747 U32 state;
a3621e74
YO
1748 U32 tp = 0;
1749 U32 zp = 0;
1750
1751
1752 for( state=1 ; state < next_alloc ; state ++ ) {
1753 U32 base=0;
1754
1755 /*
1756 DEBUG_TRIE_COMPILE_MORE_r(
1757 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1758 );
1759 */
1760
1761 if (trie->states[state].trans.list) {
1762 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1763 U16 maxid=minid;
a28509cc 1764 U16 idx;
a3621e74
YO
1765
1766 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1767 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1768 if ( forid < minid ) {
1769 minid=forid;
1770 } else if ( forid > maxid ) {
1771 maxid=forid;
1772 }
a3621e74
YO
1773 }
1774 if ( transcount < tp + maxid - minid + 1) {
1775 transcount *= 2;
c944940b
JH
1776 trie->trans = (reg_trie_trans *)
1777 PerlMemShared_realloc( trie->trans,
446bd890
NC
1778 transcount
1779 * sizeof(reg_trie_trans) );
a3621e74
YO
1780 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1781 }
1782 base = trie->uniquecharcount + tp - minid;
1783 if ( maxid == minid ) {
1784 U32 set = 0;
1785 for ( ; zp < tp ; zp++ ) {
1786 if ( ! trie->trans[ zp ].next ) {
1787 base = trie->uniquecharcount + zp - minid;
1788 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1789 trie->trans[ zp ].check = state;
1790 set = 1;
1791 break;
1792 }
1793 }
1794 if ( !set ) {
1795 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1796 trie->trans[ tp ].check = state;
1797 tp++;
1798 zp = tp;
1799 }
1800 } else {
1801 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1802 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1803 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1804 trie->trans[ tid ].check = state;
1805 }
1806 tp += ( maxid - minid + 1 );
1807 }
1808 Safefree(trie->states[ state ].trans.list);
1809 }
1810 /*
1811 DEBUG_TRIE_COMPILE_MORE_r(
1812 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1813 );
1814 */
1815 trie->states[ state ].trans.base=base;
1816 }
cc601c31 1817 trie->lasttrans = tp + 1;
a3621e74
YO
1818 }
1819 } else {
1820 /*
1821 Second Pass -- Flat Table Representation.
1822
1823 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1824 We know that we will need Charcount+1 trans at most to store the data
1825 (one row per char at worst case) So we preallocate both structures
1826 assuming worst case.
1827
1828 We then construct the trie using only the .next slots of the entry
1829 structs.
1830
3b753521 1831 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1832 make compression both faster and easier by keeping track of how many non
1833 zero fields are in the node.
1834
1835 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1836 transition.
1837
1838 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1839 number representing the first entry of the node, and state as a
1840 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1841 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1842 are 2 entrys per node. eg:
1843
1844 A B A B
1845 1. 2 4 1. 3 7
1846 2. 0 3 3. 0 5
1847 3. 0 0 5. 0 0
1848 4. 0 0 7. 0 0
1849
1850 The table is internally in the right hand, idx form. However as we also
1851 have to deal with the states array which is indexed by nodenum we have to
1852 use TRIE_NODENUM() to convert.
1853
1854 */
1e2e3d02
YO
1855 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1856 "%*sCompiling trie using table compiler\n",
1857 (int)depth * 2 + 2, ""));
3dab1dad 1858
c944940b
JH
1859 trie->trans = (reg_trie_trans *)
1860 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1861 * trie->uniquecharcount + 1,
1862 sizeof(reg_trie_trans) );
1863 trie->states = (reg_trie_state *)
1864 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1865 sizeof(reg_trie_state) );
a3621e74
YO
1866 next_alloc = trie->uniquecharcount + 1;
1867
3dab1dad 1868
a3621e74
YO
1869 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1870
c445ea15 1871 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1872 const U8 *uc = (U8*)STRING( noper );
1873 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1874
1875 U32 state = 1; /* required init */
1876
1877 U16 charid = 0; /* sanity init */
1878 U32 accept_state = 0; /* sanity init */
1879 U8 *scan = (U8*)NULL; /* sanity init */
1880
1881 STRLEN foldlen = 0; /* required init */
07be1b83 1882 U32 wordlen = 0; /* required init */
a3621e74
YO
1883 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1884
3dab1dad 1885 if ( OP(noper) != NOTHING ) {
786e8c11 1886 for ( ; uc < e ; uc += len ) {
a3621e74 1887
786e8c11 1888 TRIE_READ_CHAR;
a3621e74 1889
786e8c11
YO
1890 if ( uvc < 256 ) {
1891 charid = trie->charmap[ uvc ];
1892 } else {
55eed653 1893 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1894 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1895 }
786e8c11
YO
1896 if ( charid ) {
1897 charid--;
1898 if ( !trie->trans[ state + charid ].next ) {
1899 trie->trans[ state + charid ].next = next_alloc;
1900 trie->trans[ state ].check++;
2e64971a
DM
1901 prev_states[TRIE_NODENUM(next_alloc)]
1902 = TRIE_NODENUM(state);
786e8c11
YO
1903 next_alloc += trie->uniquecharcount;
1904 }
1905 state = trie->trans[ state + charid ].next;
1906 } else {
1907 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1908 }
1909 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1910 }
a3621e74 1911 }
3dab1dad
YO
1912 accept_state = TRIE_NODENUM( state );
1913 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1914
1915 } /* end second pass */
1916
3dab1dad 1917 /* and now dump it out before we compress it */
2b8b4781
NC
1918 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1919 revcharmap,
1920 next_alloc, depth+1));
a3621e74 1921
a3621e74
YO
1922 {
1923 /*
1924 * Inplace compress the table.*
1925
1926 For sparse data sets the table constructed by the trie algorithm will
1927 be mostly 0/FAIL transitions or to put it another way mostly empty.
1928 (Note that leaf nodes will not contain any transitions.)
1929
1930 This algorithm compresses the tables by eliminating most such
1931 transitions, at the cost of a modest bit of extra work during lookup:
1932
1933 - Each states[] entry contains a .base field which indicates the
1934 index in the state[] array wheres its transition data is stored.
1935
3b753521 1936 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1937
1938 - If .base is nonzero then charid is added to it to find an entry in
1939 the trans array.
1940
1941 -If trans[states[state].base+charid].check!=state then the
1942 transition is taken to be a 0/Fail transition. Thus if there are fail
1943 transitions at the front of the node then the .base offset will point
1944 somewhere inside the previous nodes data (or maybe even into a node
1945 even earlier), but the .check field determines if the transition is
1946 valid.
1947
786e8c11 1948 XXX - wrong maybe?
a3621e74 1949 The following process inplace converts the table to the compressed
3b753521 1950 table: We first do not compress the root node 1,and mark all its
a3621e74 1951 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1952 allows us to do a DFA construction from the compressed table later,
1953 and ensures that any .base pointers we calculate later are greater
1954 than 0.
a3621e74
YO
1955
1956 - We set 'pos' to indicate the first entry of the second node.
1957
1958 - We then iterate over the columns of the node, finding the first and
1959 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1960 and set the .check pointers accordingly, and advance pos
1961 appropriately and repreat for the next node. Note that when we copy
1962 the next pointers we have to convert them from the original
1963 NODEIDX form to NODENUM form as the former is not valid post
1964 compression.
1965
1966 - If a node has no transitions used we mark its base as 0 and do not
1967 advance the pos pointer.
1968
1969 - If a node only has one transition we use a second pointer into the
1970 structure to fill in allocated fail transitions from other states.
1971 This pointer is independent of the main pointer and scans forward
1972 looking for null transitions that are allocated to a state. When it
1973 finds one it writes the single transition into the "hole". If the
786e8c11 1974 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1975
1976 - Once compressed we can Renew/realloc the structures to release the
1977 excess space.
1978
1979 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1980 specifically Fig 3.47 and the associated pseudocode.
1981
1982 demq
1983 */
a3b680e6 1984 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1985 U32 state, charid;
a3621e74 1986 U32 pos = 0, zp=0;
1e2e3d02 1987 trie->statecount = laststate;
a3621e74
YO
1988
1989 for ( state = 1 ; state < laststate ; state++ ) {
1990 U8 flag = 0;
a28509cc
AL
1991 const U32 stateidx = TRIE_NODEIDX( state );
1992 const U32 o_used = trie->trans[ stateidx ].check;
1993 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1994 trie->trans[ stateidx ].check = 0;
1995
1996 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1997 if ( flag || trie->trans[ stateidx + charid ].next ) {
1998 if ( trie->trans[ stateidx + charid ].next ) {
1999 if (o_used == 1) {
2000 for ( ; zp < pos ; zp++ ) {
2001 if ( ! trie->trans[ zp ].next ) {
2002 break;
2003 }
2004 }
2005 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2006 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2007 trie->trans[ zp ].check = state;
2008 if ( ++zp > pos ) pos = zp;
2009 break;
2010 }
2011 used--;
2012 }
2013 if ( !flag ) {
2014 flag = 1;
2015 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2016 }
2017 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2018 trie->trans[ pos ].check = state;
2019 pos++;
2020 }
2021 }
2022 }
cc601c31 2023 trie->lasttrans = pos + 1;
c944940b
JH
2024 trie->states = (reg_trie_state *)
2025 PerlMemShared_realloc( trie->states, laststate
2026 * sizeof(reg_trie_state) );
a3621e74 2027 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2028 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2029 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2030 (int)depth * 2 + 2,"",
2031 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2032 (IV)next_alloc,
2033 (IV)pos,
a3621e74
YO
2034 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2035 );
2036
2037 } /* end table compress */
2038 }
1e2e3d02
YO
2039 DEBUG_TRIE_COMPILE_MORE_r(
2040 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2041 (int)depth * 2 + 2, "",
2042 (UV)trie->statecount,
2043 (UV)trie->lasttrans)
2044 );
cc601c31 2045 /* resize the trans array to remove unused space */
c944940b
JH
2046 trie->trans = (reg_trie_trans *)
2047 PerlMemShared_realloc( trie->trans, trie->lasttrans
2048 * sizeof(reg_trie_trans) );
a3621e74 2049
3b753521 2050 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2051 U8 nodetype =(U8)(flags & 0xFF);
2052 char *str=NULL;
786e8c11 2053
07be1b83 2054#ifdef DEBUGGING
e62cc96a 2055 regnode *optimize = NULL;
7122b237
YO
2056#ifdef RE_TRACK_PATTERN_OFFSETS
2057
b57a0404
JH
2058 U32 mjd_offset = 0;
2059 U32 mjd_nodelen = 0;
7122b237
YO
2060#endif /* RE_TRACK_PATTERN_OFFSETS */
2061#endif /* DEBUGGING */
a3621e74 2062 /*
3dab1dad
YO
2063 This means we convert either the first branch or the first Exact,
2064 depending on whether the thing following (in 'last') is a branch
2065 or not and whther first is the startbranch (ie is it a sub part of
2066 the alternation or is it the whole thing.)
3b753521 2067 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2068 the whole branch sequence, including the first.
a3621e74 2069 */
3dab1dad 2070 /* Find the node we are going to overwrite */
7f69552c 2071 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2072 /* branch sub-chain */
3dab1dad 2073 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2074#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2075 DEBUG_r({
2076 mjd_offset= Node_Offset((convert));
2077 mjd_nodelen= Node_Length((convert));
2078 });
7122b237 2079#endif
7f69552c 2080 /* whole branch chain */
7122b237
YO
2081 }
2082#ifdef RE_TRACK_PATTERN_OFFSETS
2083 else {
7f69552c
YO
2084 DEBUG_r({
2085 const regnode *nop = NEXTOPER( convert );
2086 mjd_offset= Node_Offset((nop));
2087 mjd_nodelen= Node_Length((nop));
2088 });
07be1b83
YO
2089 }
2090 DEBUG_OPTIMISE_r(
2091 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2092 (int)depth * 2 + 2, "",
786e8c11 2093 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2094 );
7122b237 2095#endif
3dab1dad
YO
2096 /* But first we check to see if there is a common prefix we can
2097 split out as an EXACT and put in front of the TRIE node. */
2098 trie->startstate= 1;
55eed653 2099 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2100 U32 state;
1e2e3d02 2101 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2102 U32 ofs = 0;
8e11feef
RGS
2103 I32 idx = -1;
2104 U32 count = 0;
2105 const U32 base = trie->states[ state ].trans.base;
a3621e74 2106
3dab1dad 2107 if ( trie->states[state].wordnum )
8e11feef 2108 count = 1;
a3621e74 2109
8e11feef 2110 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2111 if ( ( base + ofs >= trie->uniquecharcount ) &&
2112 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2113 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2114 {
3dab1dad 2115 if ( ++count > 1 ) {
2b8b4781 2116 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2117 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2118 if ( state == 1 ) break;
3dab1dad
YO
2119 if ( count == 2 ) {
2120 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2121 DEBUG_OPTIMISE_r(
8e11feef
RGS
2122 PerlIO_printf(Perl_debug_log,
2123 "%*sNew Start State=%"UVuf" Class: [",
2124 (int)depth * 2 + 2, "",
786e8c11 2125 (UV)state));
be8e71aa 2126 if (idx >= 0) {
2b8b4781 2127 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2128 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2129
3dab1dad 2130 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2131 if ( folder )
2132 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2133 DEBUG_OPTIMISE_r(
f1f66076 2134 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2135 );
8e11feef
RGS
2136 }
2137 }
2138 TRIE_BITMAP_SET(trie,*ch);
2139 if ( folder )
2140 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2141 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2142 }
2143 idx = ofs;
2144 }
3dab1dad
YO
2145 }
2146 if ( count == 1 ) {
2b8b4781 2147 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2148 STRLEN len;
2149 char *ch = SvPV( *tmp, len );
de734bd5
A
2150 DEBUG_OPTIMISE_r({
2151 SV *sv=sv_newmortal();
8e11feef
RGS
2152 PerlIO_printf( Perl_debug_log,
2153 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2154 (int)depth * 2 + 2, "",
de734bd5
A
2155 (UV)state, (UV)idx,
2156 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2157 PL_colors[0], PL_colors[1],
2158 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2159 PERL_PV_ESCAPE_FIRSTCHAR
2160 )
2161 );
2162 });
3dab1dad
YO
2163 if ( state==1 ) {
2164 OP( convert ) = nodetype;
2165 str=STRING(convert);
2166 STR_LEN(convert)=0;
2167 }
c490c714
YO
2168 STR_LEN(convert) += len;
2169 while (len--)
de734bd5 2170 *str++ = *ch++;
8e11feef 2171 } else {
f9049ba1 2172#ifdef DEBUGGING
8e11feef
RGS
2173 if (state>1)
2174 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2175#endif
8e11feef
RGS
2176 break;
2177 }
2178 }
2e64971a 2179 trie->prefixlen = (state-1);
3dab1dad 2180 if (str) {
8e11feef 2181 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2182 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2183 trie->startstate = state;
07be1b83
YO
2184 trie->minlen -= (state - 1);
2185 trie->maxlen -= (state - 1);
33809eae
JH
2186#ifdef DEBUGGING
2187 /* At least the UNICOS C compiler choked on this
2188 * being argument to DEBUG_r(), so let's just have
2189 * it right here. */
2190 if (
2191#ifdef PERL_EXT_RE_BUILD
2192 1
2193#else
2194 DEBUG_r_TEST
2195#endif
2196 ) {
2197 regnode *fix = convert;
2198 U32 word = trie->wordcount;
2199 mjd_nodelen++;
2200 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2201 while( ++fix < n ) {
2202 Set_Node_Offset_Length(fix, 0, 0);
2203 }
2204 while (word--) {
2205 SV ** const tmp = av_fetch( trie_words, word, 0 );
2206 if (tmp) {
2207 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2208 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2209 else
2210 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2211 }
2212 }
2213 }
2214#endif
8e11feef
RGS
2215 if (trie->maxlen) {
2216 convert = n;
2217 } else {
3dab1dad 2218 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2219 DEBUG_r(optimize= n);
3dab1dad
YO
2220 }
2221 }
2222 }
a5ca303d
YO
2223 if (!jumper)
2224 jumper = last;
3dab1dad 2225 if ( trie->maxlen ) {
8e11feef
RGS
2226 NEXT_OFF( convert ) = (U16)(tail - convert);
2227 ARG_SET( convert, data_slot );
786e8c11
YO
2228 /* Store the offset to the first unabsorbed branch in
2229 jump[0], which is otherwise unused by the jump logic.
2230 We use this when dumping a trie and during optimisation. */
2231 if (trie->jump)
7f69552c 2232 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2233
6c48061a
YO
2234 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2235 * and there is a bitmap
2236 * and the first "jump target" node we found leaves enough room
2237 * then convert the TRIE node into a TRIEC node, with the bitmap
2238 * embedded inline in the opcode - this is hypothetically faster.
2239 */
2240 if ( !trie->states[trie->startstate].wordnum
2241 && trie->bitmap
2242 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2243 {
2244 OP( convert ) = TRIEC;
2245 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2246 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2247 trie->bitmap= NULL;
2248 } else
2249 OP( convert ) = TRIE;
a3621e74 2250
3dab1dad
YO
2251 /* store the type in the flags */
2252 convert->flags = nodetype;
a5ca303d
YO
2253 DEBUG_r({
2254 optimize = convert
2255 + NODE_STEP_REGNODE
2256 + regarglen[ OP( convert ) ];
2257 });
2258 /* XXX We really should free up the resource in trie now,
2259 as we won't use them - (which resources?) dmq */
3dab1dad 2260 }
a3621e74 2261 /* needed for dumping*/
e62cc96a 2262 DEBUG_r(if (optimize) {
07be1b83 2263 regnode *opt = convert;
bcdf7404 2264
e62cc96a 2265 while ( ++opt < optimize) {
07be1b83
YO
2266 Set_Node_Offset_Length(opt,0,0);
2267 }
786e8c11
YO
2268 /*
2269 Try to clean up some of the debris left after the
2270 optimisation.
a3621e74 2271 */
786e8c11 2272 while( optimize < jumper ) {
07be1b83 2273 mjd_nodelen += Node_Length((optimize));
a3621e74 2274 OP( optimize ) = OPTIMIZED;
07be1b83 2275 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2276 optimize++;
2277 }
07be1b83 2278 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2279 });
2280 } /* end node insert */
2e64971a
DM
2281
2282 /* Finish populating the prev field of the wordinfo array. Walk back
2283 * from each accept state until we find another accept state, and if
2284 * so, point the first word's .prev field at the second word. If the
2285 * second already has a .prev field set, stop now. This will be the
2286 * case either if we've already processed that word's accept state,
3b753521
FN
2287 * or that state had multiple words, and the overspill words were
2288 * already linked up earlier.
2e64971a
DM
2289 */
2290 {
2291 U16 word;
2292 U32 state;
2293 U16 prev;
2294
2295 for (word=1; word <= trie->wordcount; word++) {
2296 prev = 0;
2297 if (trie->wordinfo[word].prev)
2298 continue;
2299 state = trie->wordinfo[word].accept;
2300 while (state) {
2301 state = prev_states[state];
2302 if (!state)
2303 break;
2304 prev = trie->states[state].wordnum;
2305 if (prev)
2306 break;
2307 }
2308 trie->wordinfo[word].prev = prev;
2309 }
2310 Safefree(prev_states);
2311 }
2312
2313
2314 /* and now dump out the compressed format */
2315 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2316
55eed653 2317 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2318#ifdef DEBUGGING
2319 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2320 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2321#else
2322 SvREFCNT_dec(revcharmap);
07be1b83 2323#endif
786e8c11
YO
2324 return trie->jump
2325 ? MADE_JUMP_TRIE
2326 : trie->startstate>1
2327 ? MADE_EXACT_TRIE
2328 : MADE_TRIE;
2329}
2330
2331STATIC void
2332S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2333{
3b753521 2334/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2335
2336 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2337 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2338 ISBN 0-201-10088-6
2339
2340 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2341 suffix of the current state's 'word' that is also a proper prefix of another word in our
2342 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2343 the DFA not to have to restart after its tried and failed a word at a given point, it
2344 simply continues as though it had been matching the other word in the first place.
2345 Consider
2346 'abcdgu'=~/abcdefg|cdgu/
2347 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2348 fail, which would bring us to the state representing 'd' in the second word where we would
2349 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2350 */
2351 /* add a fail transition */
3251b653
NC
2352 const U32 trie_offset = ARG(source);
2353 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2354 U32 *q;
2355 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2356 const U32 numstates = trie->statecount;
786e8c11
YO
2357 const U32 ubound = trie->lasttrans + ucharcount;
2358 U32 q_read = 0;
2359 U32 q_write = 0;
2360 U32 charid;
2361 U32 base = trie->states[ 1 ].trans.base;
2362 U32 *fail;
2363 reg_ac_data *aho;
2364 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2365 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2366
2367 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2368#ifndef DEBUGGING
2369 PERL_UNUSED_ARG(depth);
2370#endif
2371
2372
2373 ARG_SET( stclass, data_slot );
c944940b 2374 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2375 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2376 aho->trie=trie_offset;
446bd890
NC
2377 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2378 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2379 Newxz( q, numstates, U32);
c944940b 2380 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2381 aho->refcount = 1;
2382 fail = aho->fail;
2383 /* initialize fail[0..1] to be 1 so that we always have
2384 a valid final fail state */
2385 fail[ 0 ] = fail[ 1 ] = 1;
2386
2387 for ( charid = 0; charid < ucharcount ; charid++ ) {
2388 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2389 if ( newstate ) {
2390 q[ q_write ] = newstate;
2391 /* set to point at the root */
2392 fail[ q[ q_write++ ] ]=1;
2393 }
2394 }
2395 while ( q_read < q_write) {
2396 const U32 cur = q[ q_read++ % numstates ];
2397 base = trie->states[ cur ].trans.base;
2398
2399 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2400 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2401 if (ch_state) {
2402 U32 fail_state = cur;
2403 U32 fail_base;
2404 do {
2405 fail_state = fail[ fail_state ];
2406 fail_base = aho->states[ fail_state ].trans.base;
2407 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2408
2409 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2410 fail[ ch_state ] = fail_state;
2411 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2412 {
2413 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2414 }
2415 q[ q_write++ % numstates] = ch_state;
2416 }
2417 }
2418 }
2419 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2420 when we fail in state 1, this allows us to use the
2421 charclass scan to find a valid start char. This is based on the principle
2422 that theres a good chance the string being searched contains lots of stuff
2423 that cant be a start char.
2424 */
2425 fail[ 0 ] = fail[ 1 ] = 0;
2426 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2427 PerlIO_printf(Perl_debug_log,
2428 "%*sStclass Failtable (%"UVuf" states): 0",
2429 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2430 );
786e8c11
YO
2431 for( q_read=1; q_read<numstates; q_read++ ) {
2432 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2433 }
2434 PerlIO_printf(Perl_debug_log, "\n");
2435 });
2436 Safefree(q);
2437 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2438}
2439
786e8c11 2440
a3621e74 2441/*
5d1c421c
JH
2442 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2443 * These need to be revisited when a newer toolchain becomes available.
2444 */
2445#if defined(__sparc64__) && defined(__GNUC__)
2446# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2447# undef SPARC64_GCC_WORKAROUND
2448# define SPARC64_GCC_WORKAROUND 1
2449# endif
2450#endif
2451
07be1b83 2452#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2453 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2454 SV * const mysv=sv_newmortal(); \
2455 regnode *Next = regnext(scan); \
2456 regprop(RExC_rx, mysv, scan); \
7f69552c 2457 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2458 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2459 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2460 }});
07be1b83 2461
1de06328
YO
2462
2463
2464
2465
07be1b83
YO
2466#define JOIN_EXACT(scan,min,flags) \
2467 if (PL_regkind[OP(scan)] == EXACT) \
2468 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2469
be8e71aa 2470STATIC U32
07be1b83
YO
2471S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2472 /* Merge several consecutive EXACTish nodes into one. */
2473 regnode *n = regnext(scan);
2474 U32 stringok = 1;
2475 regnode *next = scan + NODE_SZ_STR(scan);
2476 U32 merged = 0;
2477 U32 stopnow = 0;
2478#ifdef DEBUGGING
2479 regnode *stop = scan;
72f13be8 2480 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2481#else
d47053eb
RGS
2482 PERL_UNUSED_ARG(depth);
2483#endif
7918f24d
NC
2484
2485 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2486#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2487 PERL_UNUSED_ARG(flags);
2488 PERL_UNUSED_ARG(val);
07be1b83 2489#endif
07be1b83
YO
2490 DEBUG_PEEP("join",scan,depth);
2491
2492 /* Skip NOTHING, merge EXACT*. */
2493 while (n &&
2494 ( PL_regkind[OP(n)] == NOTHING ||
2495 (stringok && (OP(n) == OP(scan))))
2496 && NEXT_OFF(n)
2497 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2498
2499 if (OP(n) == TAIL || n > next)
2500 stringok = 0;
2501 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2502 DEBUG_PEEP("skip:",n,depth);
2503 NEXT_OFF(scan) += NEXT_OFF(n);
2504 next = n + NODE_STEP_REGNODE;
2505#ifdef DEBUGGING
2506 if (stringok)
2507 stop = n;
2508#endif
2509 n = regnext(n);
2510 }
2511 else if (stringok) {
786e8c11 2512 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2513 regnode * const nnext = regnext(n);
2514
2515 DEBUG_PEEP("merg",n,depth);
2516
2517 merged++;
2518 if (oldl + STR_LEN(n) > U8_MAX)
2519 break;
2520 NEXT_OFF(scan) += NEXT_OFF(n);
2521 STR_LEN(scan) += STR_LEN(n);
2522 next = n + NODE_SZ_STR(n);
2523 /* Now we can overwrite *n : */
2524 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2525#ifdef DEBUGGING
2526 stop = next - 1;
2527#endif
2528 n = nnext;
2529 if (stopnow) break;
2530 }
2531
d47053eb
RGS
2532#ifdef EXPERIMENTAL_INPLACESCAN
2533 if (flags && !NEXT_OFF(n)) {
2534 DEBUG_PEEP("atch", val, depth);
2535 if (reg_off_by_arg[OP(n)]) {
2536 ARG_SET(n, val - n);
2537 }
2538 else {
2539 NEXT_OFF(n) = val - n;
2540 }
2541 stopnow = 1;
2542 }
07be1b83
YO
2543#endif
2544 }
ced7f090
KW
2545#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2546#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2547#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2548#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2549
2550 if (UTF
2f7f8cb1 2551 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2c2b7f86
KW
2552 && ( STR_LEN(scan) >= 6 ) )
2553 {
07be1b83
YO
2554 /*
2555 Two problematic code points in Unicode casefolding of EXACT nodes:
2556
2557 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2558 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2559
2560 which casefold to
2561
2562 Unicode UTF-8
2563
2564 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2565 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2566
2567 This means that in case-insensitive matching (or "loose matching",
2568 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2569 length of the above casefolded versions) can match a target string
2570 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2571 This would rather mess up the minimum length computation.
2572
2573 What we'll do is to look for the tail four bytes, and then peek
2574 at the preceding two bytes to see whether we need to decrease
2575 the minimum length by four (six minus two).
2576
2577 Thanks to the design of UTF-8, there cannot be false matches:
2578 A sequence of valid UTF-8 bytes cannot be a subsequence of
2579 another valid sequence of UTF-8 bytes.
2580
2581 */
2582 char * const s0 = STRING(scan), *s, *t;
2583 char * const s1 = s0 + STR_LEN(scan) - 1;
2584 char * const s2 = s1 - 4;
e294cc5d
JH
2585#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2586 const char t0[] = "\xaf\x49\xaf\x42";
2587#else
07be1b83 2588 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2589#endif
07be1b83
YO
2590 const char * const t1 = t0 + 3;
2591
2592 for (s = s0 + 2;
2593 s < s2 && (t = ninstr(s, s1, t0, t1));
2594 s = t + 4) {
e294cc5d
JH
2595#ifdef EBCDIC
2596 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2597 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2598#else
07be1b83
YO
2599 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2600 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2601#endif
07be1b83
YO
2602 *min -= 4;
2603 }
2604 }
2605
2606#ifdef DEBUGGING
2607 /* Allow dumping */
2608 n = scan + NODE_SZ_STR(scan);
2609 while (n <= stop) {
2610 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2611 OP(n) = OPTIMIZED;
2612 NEXT_OFF(n) = 0;
2613 }
2614 n++;
2615 }
2616#endif
2617 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2618 return stopnow;
2619}
2620
486ec47a 2621/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2622 Finds fixed substrings. */
2623
a0288114 2624/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2625 to the position after last scanned or to NULL. */
2626
40d049e4
YO
2627#define INIT_AND_WITHP \
2628 assert(!and_withp); \
2629 Newx(and_withp,1,struct regnode_charclass_class); \
2630 SAVEFREEPV(and_withp)
07be1b83 2631
b515a41d 2632/* this is a chain of data about sub patterns we are processing that
486ec47a 2633 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2634 we can simulate recursion without losing state. */
2635struct scan_frame;
2636typedef struct scan_frame {
2637 regnode *last; /* last node to process in this frame */
2638 regnode *next; /* next node to process when last is reached */
2639 struct scan_frame *prev; /*previous frame*/
2640 I32 stop; /* what stopparen do we use */
2641} scan_frame;
2642
304ee84b
YO
2643
2644#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2645
e1d1eefb
YO
2646#define CASE_SYNST_FNC(nAmE) \
2647case nAmE: \
2648 if (flags & SCF_DO_STCLASS_AND) { \
2649 for (value = 0; value < 256; value++) \
2650 if (!is_ ## nAmE ## _cp(value)) \
2651 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2652 } \
2653 else { \
2654 for (value = 0; value < 256; value++) \
2655 if (is_ ## nAmE ## _cp(value)) \
2656 ANYOF_BITMAP_SET(data->start_class, value); \
2657 } \
2658 break; \
2659case N ## nAmE: \
2660 if (flags & SCF_DO_STCLASS_AND) { \
2661 for (value = 0; value < 256; value++) \
2662 if (is_ ## nAmE ## _cp(value)) \
2663 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2664 } \
2665 else { \
2666 for (value = 0; value < 256; value++) \
2667 if (!is_ ## nAmE ## _cp(value)) \
2668 ANYOF_BITMAP_SET(data->start_class, value); \
2669 } \
2670 break
2671
2672
2673
76e3520e 2674STATIC I32
40d049e4 2675S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2676 I32 *minlenp, I32 *deltap,
40d049e4
YO
2677 regnode *last,
2678 scan_data_t *data,
2679 I32 stopparen,
2680 U8* recursed,
2681 struct regnode_charclass_class *and_withp,
2682 U32 flags, U32 depth)
c277df42
IZ
2683 /* scanp: Start here (read-write). */
2684 /* deltap: Write maxlen-minlen here. */
2685 /* last: Stop before this one. */
40d049e4
YO
2686 /* data: string data about the pattern */
2687 /* stopparen: treat close N as END */
2688 /* recursed: which subroutines have we recursed into */
2689 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2690{
97aff369 2691 dVAR;
c277df42
IZ
2692 I32 min = 0, pars = 0, code;
2693 regnode *scan = *scanp, *next;
2694 I32 delta = 0;
2695 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2696 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2697 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2698 scan_data_t data_fake;
a3621e74 2699 SV *re_trie_maxbuff = NULL;
786e8c11 2700 regnode *first_non_open = scan;
e2e6a0f1 2701 I32 stopmin = I32_MAX;
8aa23a47 2702 scan_frame *frame = NULL;
a3621e74 2703 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2704
7918f24d
NC
2705 PERL_ARGS_ASSERT_STUDY_CHUNK;
2706
13a24bad 2707#ifdef DEBUGGING
40d049e4 2708 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2709#endif
40d049e4 2710
786e8c11 2711 if ( depth == 0 ) {
40d049e4 2712 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2713 first_non_open=regnext(first_non_open);
2714 }
2715
b81d288d 2716
8aa23a47
YO
2717 fake_study_recurse:
2718 while ( scan && OP(scan) != END && scan < last ){
2719 /* Peephole optimizer: */
304ee84b 2720 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2721 DEBUG_PEEP("Peep",scan,depth);
2722 JOIN_EXACT(scan,&min,0);
2723
2724 /* Follow the next-chain of the current node and optimize
2725 away all the NOTHINGs from it. */
2726 if (OP(scan) != CURLYX) {
2727 const int max = (reg_off_by_arg[OP(scan)]
2728 ? I32_MAX
2729 /* I32 may be smaller than U16 on CRAYs! */
2730 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2731 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2732 int noff;
2733 regnode *n = scan;
2734
2735 /* Skip NOTHING and LONGJMP. */
2736 while ((n = regnext(n))
2737 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2738 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2739 && off + noff < max)
2740 off += noff;
2741 if (reg_off_by_arg[OP(scan)])
2742 ARG(scan) = off;
2743 else
2744 NEXT_OFF(scan) = off;
2745 }
a3621e74 2746
c277df42 2747
8aa23a47
YO
2748
2749 /* The principal pseudo-switch. Cannot be a switch, since we
2750 look into several different things. */
2751 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2752 || OP(scan) == IFTHEN) {
2753 next = regnext(scan);
2754 code = OP(scan);
2755 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2756
2757 if (OP(next) == code || code == IFTHEN) {
2758 /* NOTE - There is similar code to this block below for handling
2759 TRIE nodes on a re-study. If you change stuff here check there
2760 too. */
2761 I32 max1 = 0, min1 = I32_MAX, num = 0;
2762 struct regnode_charclass_class accum;
2763 regnode * const startbranch=scan;
2764
2765 if (flags & SCF_DO_SUBSTR)
304ee84b 2766 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 2767 if (flags & SCF_DO_STCLASS)
e755fd73 2768 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
2769
2770 while (OP(scan) == code) {
2771 I32 deltanext, minnext, f = 0, fake;
2772 struct regnode_charclass_class this_class;
2773
2774 num++;
2775 data_fake.flags = 0;
2776 if (data) {
2777 data_fake.whilem_c = data->whilem_c;
2778 data_fake.last_closep = data->last_closep;
2779 }
2780 else
2781 data_fake.last_closep = &fake;
58e23c8d
YO
2782
2783 data_fake.pos_delta = delta;
8aa23a47
YO
2784 next = regnext(scan);
2785 scan = NEXTOPER(scan);
2786 if (code != BRANCH)
c277df42 2787 scan = NEXTOPER(scan);
8aa23a47 2788 if (flags & SCF_DO_STCLASS) {
e755fd73 2789 cl_init(pRExC_state, &this_class);
8aa23a47
YO
2790 data_fake.start_class = &this_class;
2791 f = SCF_DO_STCLASS_AND;
58e23c8d 2792 }
8aa23a47
YO
2793 if (flags & SCF_WHILEM_VISITED_POS)
2794 f |= SCF_WHILEM_VISITED_POS;
2795
2796 /* we suppose the run is continuous, last=next...*/
2797 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2798 next, &data_fake,
2799 stopparen, recursed, NULL, f,depth+1);
2800 if (min1 > minnext)
2801 min1 = minnext;
2802 if (max1 < minnext + deltanext)
2803 max1 = minnext + deltanext;
2804 if (deltanext == I32_MAX)
2805 is_inf = is_inf_internal = 1;
2806 scan = next;
2807 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2808 pars++;
2809 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2810 if ( stopmin > minnext)
2811 stopmin = min + min1;
2812 flags &= ~SCF_DO_SUBSTR;
2813 if (data)
2814 data->flags |= SCF_SEEN_ACCEPT;
2815 }
2816 if (data) {
2817 if (data_fake.flags & SF_HAS_EVAL)
2818 data->flags |= SF_HAS_EVAL;
2819 data->whilem_c = data_fake.whilem_c;
3dab1dad 2820 }
8aa23a47 2821 if (flags & SCF_DO_STCLASS)
3fffb88a 2822 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
2823 }
2824 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2825 min1 = 0;
2826 if (flags & SCF_DO_SUBSTR) {
2827 data->pos_min += min1;
2828 data->pos_delta += max1 - min1;
2829 if (max1 != min1 || is_inf)
2830 data->longest = &(data->longest_float);
2831 }
2832 min += min1;
2833 delta += max1 - min1;
2834 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 2835 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
2836 if (min1) {
2837 cl_and(data->start_class, and_withp);
2838 flags &= ~SCF_DO_STCLASS;
653099ff 2839 }
8aa23a47
YO
2840 }
2841 else if (flags & SCF_DO_STCLASS_AND) {
2842 if (min1) {
2843 cl_and(data->start_class, &accum);
2844 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2845 }
8aa23a47
YO
2846 else {
2847 /* Switch to OR mode: cache the old value of
2848 * data->start_class */
2849 INIT_AND_WITHP;
2850 StructCopy(data->start_class, and_withp,
2851 struct regnode_charclass_class);
2852 flags &= ~SCF_DO_STCLASS_AND;
2853 StructCopy(&accum, data->start_class,
2854 struct regnode_charclass_class);
2855 flags |= SCF_DO_STCLASS_OR;
2856 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2857 }
8aa23a47 2858 }
a3621e74 2859
8aa23a47
YO
2860 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2861 /* demq.
a3621e74 2862
8aa23a47
YO
2863 Assuming this was/is a branch we are dealing with: 'scan' now
2864 points at the item that follows the branch sequence, whatever
2865 it is. We now start at the beginning of the sequence and look
2866 for subsequences of
a3621e74 2867
8aa23a47
YO
2868 BRANCH->EXACT=>x1
2869 BRANCH->EXACT=>x2
2870 tail
a3621e74 2871
8aa23a47 2872 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2873
486ec47a 2874 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2875 element into a trie and then add the subsequent branch exact
2876 strings to the trie.
a3621e74 2877
8aa23a47 2878 We have two cases
a3621e74 2879
3b753521 2880 1. patterns where the whole set of branches can be converted.
a3621e74 2881
8aa23a47 2882 2. patterns where only a subset can be converted.
a3621e74 2883
8aa23a47
YO
2884 In case 1 we can replace the whole set with a single regop
2885 for the trie. In case 2 we need to keep the start and end
3b753521 2886 branches so
a3621e74 2887
8aa23a47
YO
2888 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2889 becomes BRANCH TRIE; BRANCH X;
786e8c11 2890
8aa23a47
YO
2891 There is an additional case, that being where there is a
2892 common prefix, which gets split out into an EXACT like node
2893 preceding the TRIE node.
a3621e74 2894
8aa23a47
YO
2895 If x(1..n)==tail then we can do a simple trie, if not we make
2896 a "jump" trie, such that when we match the appropriate word
486ec47a 2897 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2898 a nested if into a case structure of sorts.
b515a41d 2899
8aa23a47
YO
2900 */
2901
2902 int made=0;
2903 if (!re_trie_maxbuff) {
2904 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2905 if (!SvIOK(re_trie_maxbuff))
2906 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2907 }
2908 if ( SvIV(re_trie_maxbuff)>=0 ) {
2909 regnode *cur;
2910 regnode *first = (regnode *)NULL;
2911 regnode *last = (regnode *)NULL;
2912 regnode *tail = scan;
2913 U8 optype = 0;
2914 U32 count=0;
a3621e74
YO
2915
2916#ifdef DEBUGGING
8aa23a47 2917 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2918#endif
8aa23a47
YO
2919 /* var tail is used because there may be a TAIL
2920 regop in the way. Ie, the exacts will point to the
2921 thing following the TAIL, but the last branch will
2922 point at the TAIL. So we advance tail. If we
2923 have nested (?:) we may have to move through several
2924 tails.
2925 */
2926
2927 while ( OP( tail ) == TAIL ) {
2928 /* this is the TAIL generated by (?:) */
2929 tail = regnext( tail );
2930 }
a3621e74 2931
8aa23a47
YO
2932
2933 DEBUG_OPTIMISE_r({
2934 regprop(RExC_rx, mysv, tail );
2935 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2936 (int)depth * 2 + 2, "",
2937 "Looking for TRIE'able sequences. Tail node is: ",
2938 SvPV_nolen_const( mysv )
2939 );
2940 });
2941
2942 /*
2943
2944 step through the branches, cur represents each
2945 branch, noper is the first thing to be matched
2946 as part of that branch and noper_next is the
2947 regnext() of that node. if noper is an EXACT
2948 and noper_next is the same as scan (our current
2949 position in the regex) then the EXACT branch is
2950 a possible optimization target. Once we have
486ec47a 2951 two or more consecutive such branches we can
8aa23a47
YO
2952 create a trie of the EXACT's contents and stich
2953 it in place. If the sequence represents all of
2954 the branches we eliminate the whole thing and
2955 replace it with a single TRIE. If it is a
2956 subsequence then we need to stitch it in. This
2957 means the first branch has to remain, and needs
2958 to be repointed at the item on the branch chain
2959 following the last branch optimized. This could
2960 be either a BRANCH, in which case the
2961 subsequence is internal, or it could be the
2962 item following the branch sequence in which
2963 case the subsequence is at the end.
2964
2965 */
2966
2967 /* dont use tail as the end marker for this traverse */
2968 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2969 regnode * const noper = NEXTOPER( cur );
b515a41d 2970#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2971 regnode * const noper_next = regnext( noper );
b515a41d
YO
2972#endif
2973
8aa23a47
YO
2974 DEBUG_OPTIMISE_r({
2975 regprop(RExC_rx, mysv, cur);
2976 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2977 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2978
2979 regprop(RExC_rx, mysv, noper);
2980 PerlIO_printf( Perl_debug_log, " -> %s",
2981 SvPV_nolen_const(mysv));
2982
2983 if ( noper_next ) {
2984 regprop(RExC_rx, mysv, noper_next );
2985 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2986 SvPV_nolen_const(mysv));
2987 }
2988 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2989 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2990 });
2991 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2992 : PL_regkind[ OP( noper ) ] == EXACT )
2993 || OP(noper) == NOTHING )
786e8c11 2994#ifdef NOJUMPTRIE
8aa23a47 2995 && noper_next == tail
786e8c11 2996#endif
8aa23a47
YO
2997 && count < U16_MAX)
2998 {
2999 count++;
3000 if ( !first || optype == NOTHING ) {
3001 if (!first) first = cur;
3002 optype = OP( noper );
3003 } else {
3004 last = cur;
3005 }
3006 } else {
a0a388a1 3007/*
0abd0d78
YO
3008 Currently we do not believe that the trie logic can
3009 handle case insensitive matching properly when the
3010 pattern is not unicode (thus forcing unicode semantics).
3011
3012 If/when this is fixed the following define can be swapped
3013 in below to fully enable trie logic.
3014
f0c16e54
KW
3015 XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3016 not /aa
3017
a0a388a1 3018#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
3019
3020*/
f0c16e54 3021#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
0abd0d78 3022
a0a388a1 3023 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
3024 make_trie( pRExC_state,
3025 startbranch, first, cur, tail, count,
3026 optype, depth+1 );
3027 }
3028 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 3029#ifdef NOJUMPTRIE
8aa23a47 3030 && noper_next == tail
786e8c11 3031#endif
8aa23a47
YO
3032 ){
3033 count = 1;
3034 first = cur;
3035 optype = OP( noper );
3036 } else {
3037 count = 0;
3038 first = NULL;
3039 optype = 0;
3040 }
3041 last = NULL;
3042 }
3043 }
3044 DEBUG_OPTIMISE_r({
3045 regprop(RExC_rx, mysv, cur);
3046 PerlIO_printf( Perl_debug_log,
3047 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3048 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3049
3050 });
a0a388a1
YO
3051
3052 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 3053 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 3054#ifdef TRIE_STUDY_OPT
8aa23a47
YO
3055 if ( ((made == MADE_EXACT_TRIE &&
3056 startbranch == first)
3057 || ( first_non_open == first )) &&
3058 depth==0 ) {
3059 flags |= SCF_TRIE_RESTUDY;
3060 if ( startbranch == first
3061 && scan == tail )
3062 {
3063 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3064 }
3065 }
3dab1dad 3066#endif
8aa23a47
YO
3067 }
3068 }
3069
3070 } /* do trie */
3071
653099ff 3072 }
8aa23a47
YO
3073 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3074 scan = NEXTOPER(NEXTOPER(scan));
3075 } else /* single branch is optimized. */
3076 scan = NEXTOPER(scan);
3077 continue;
3078 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3079 scan_frame *newframe = NULL;
3080 I32 paren;
3081 regnode *start;
3082 regnode *end;
3083
3084 if (OP(scan) != SUSPEND) {
3085 /* set the pointer */
3086 if (OP(scan) == GOSUB) {
3087 paren = ARG(scan);
3088 RExC_recurse[ARG2L(scan)] = scan;
3089 start = RExC_open_parens[paren-1];
3090 end = RExC_close_parens[paren-1];
3091 } else {
3092 paren = 0;
f8fc2ecf 3093 start = RExC_rxi->program + 1;
8aa23a47
YO
3094 end = RExC_opend;
3095 }
3096 if (!recursed) {
3097 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3098 SAVEFREEPV(recursed);
3099 }
3100 if (!PAREN_TEST(recursed,paren+1)) {
3101 PAREN_SET(recursed,paren+1);
3102 Newx(newframe,1,scan_frame);
3103 } else {
3104 if (flags & SCF_DO_SUBSTR) {
304ee84b 3105 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3106 data->longest = &(data->longest_float);
3107 }
3108 is_inf = is_inf_internal = 1;
3109 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 3110 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3111 flags &= ~SCF_DO_STCLASS;
3112 }
3113 } else {
3114 Newx(newframe,1,scan_frame);
3115 paren = stopparen;
3116 start = scan+2;
3117 end = regnext(scan);
3118 }
3119 if (newframe) {
3120 assert(start);
3121 assert(end);
3122 SAVEFREEPV(newframe);
3123 newframe->next = regnext(scan);
3124 newframe->last = last;
3125 newframe->stop = stopparen;
3126 newframe->prev = frame;
3127
3128 frame = newframe;
3129 scan = start;
3130 stopparen = paren;
3131 last = end;
3132
3133 continue;
3134 }
3135 }
3136 else if (OP(scan) == EXACT) {
3137 I32 l = STR_LEN(scan);
3138 UV uc;
3139 if (UTF) {
3140 const U8 * const s = (U8*)STRING(scan);
3141 l = utf8_length(s, s + l);
3142 uc = utf8_to_uvchr(s, NULL);
3143 } else {
3144 uc = *((U8*)STRING(scan));
3145 }
3146 min += l;
3147 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3148 /* The code below prefers earlier match for fixed
3149 offset, later match for variable offset. */
3150 if (data->last_end == -1) { /* Update the start info. */
3151 data->last_start_min = data->pos_min;
3152 data->last_start_max = is_inf
3153 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3154 }
8aa23a47
YO
3155 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3156 if (UTF)
3157 SvUTF8_on(data->last_found);
3158 {
3159 SV * const sv = data->last_found;
3160 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3161 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3162 if (mg && mg->mg_len >= 0)
3163 mg->mg_len += utf8_length((U8*)STRING(scan),
3164 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3165 }
8aa23a47
YO
3166 data->last_end = data->pos_min + l;
3167 data->pos_min += l; /* As in the first entry. */
3168 data->flags &= ~SF_BEFORE_EOL;
3169 }
3170 if (flags & SCF_DO_STCLASS_AND) {
3171 /* Check whether it is compatible with what we know already! */
3172 int compat = 1;
3173
54251c2e 3174
486ec47a 3175 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3176 * in the bitmp and either 1) its bit or its fold is set, or 2)
3177 * it's for a locale. Even if there isn't unicode semantics
3178 * here, at runtime there may be because of matching against a
3179 * utf8 string, so accept a possible false positive for
3180 * latin1-range folds */
8aa23a47
YO
3181 if (uc >= 0x100 ||
3182 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3183 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3184 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3185 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47 3186 )
d18bf9dc 3187 {
8aa23a47 3188 compat = 0;
d18bf9dc 3189 }
8aa23a47
YO
3190 ANYOF_CLASS_ZERO(data->start_class);
3191 ANYOF_BITMAP_ZERO(data->start_class);
3192 if (compat)
3193 ANYOF_BITMAP_SET(data->start_class, uc);
d18bf9dc
KW
3194 else if (uc >= 0x100) {
3195 int i;
3196
3197 /* Some Unicode code points fold to the Latin1 range; as
3198 * XXX temporary code, instead of figuring out if this is
3199 * one, just assume it is and set all the start class bits
3200 * that could be some such above 255 code point's fold
3201 * which will generate fals positives. As the code
3202 * elsewhere that does compute the fold settles down, it
3203 * can be extracted out and re-used here */
3204 for (i = 0; i < 256; i++){
3205 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3206 ANYOF_BITMAP_SET(data->start_class, i);
3207 }
3208 }
3209 }
8aa23a47
YO
3210 data->start_class->flags &= ~ANYOF_EOS;
3211 if (uc < 0x100)
3212 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3213 }
3214 else if (flags & SCF_DO_STCLASS_OR) {
3215 /* false positive possible if the class is case-folded */
3216 if (uc < 0x100)
3217 ANYOF_BITMAP_SET(data->start_class, uc);
3218 else
3219 data->start_class->flags |= ANYOF_UNICODE_ALL;
3220 data->start_class->flags &= ~ANYOF_EOS;
3221 cl_and(data->start_class, and_withp);
3222 }
3223 flags &= ~SCF_DO_STCLASS;
3224 }
3225 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3226 I32 l = STR_LEN(scan);
3227 UV uc = *((U8*)STRING(scan));
3228
3229 /* Search for fixed substrings supports EXACT only. */
3230 if (flags & SCF_DO_SUBSTR) {
3231 assert(data);
304ee84b 3232 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3233 }
3234 if (UTF) {
3235 const U8 * const s = (U8 *)STRING(scan);
3236 l = utf8_length(s, s + l);
3237 uc = utf8_to_uvchr(s, NULL);
3238 }
3239 min += l;
3240 if (flags & SCF_DO_SUBSTR)
3241 data->pos_min += l;
3242 if (flags & SCF_DO_STCLASS_AND) {
3243 /* Check whether it is compatible with what we know already! */
3244 int compat = 1;
8aa23a47 3245 if (uc >= 0x100 ||
54251c2e
KW
3246 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3247 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3248 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3249 {
8aa23a47 3250 compat = 0;
54251c2e 3251 }
8aa23a47
YO
3252 ANYOF_CLASS_ZERO(data->start_class);
3253 ANYOF_BITMAP_ZERO(data->start_class);
3254 if (compat) {
3255 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3256 data->start_class->flags &= ~ANYOF_EOS;
39065660 3257 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3258 if (OP(scan) == EXACTFL) {
af302e7f
KW
3259 /* XXX This set is probably no longer necessary, and
3260 * probably wrong as LOCALE now is on in the initial
3261 * state */
8aa23a47 3262 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3263 }
3264 else {
3265
54251c2e
KW
3266 /* Also set the other member of the fold pair. In case
3267 * that unicode semantics is called for at runtime, use
3268 * the full latin1 fold. (Can't do this for locale,
3269 * because not known until runtime */
3270 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3271 }
653099ff 3272 }
d18bf9dc
KW
3273 else if (uc >= 0x100) {
3274 int i;
3275 for (i = 0; i < 256; i++){
3276 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3277 ANYOF_BITMAP_SET(data->start_class, i);
3278 }
3279 }
3280 }
8aa23a47
YO
3281 }
3282 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3283 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3284 /* false positive possible if the class is case-folded.
3285 Assume that the locale settings are the same... */
970c8436 3286 if (uc < 0x100) {
1aa99e6b 3287 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3288 if (OP(scan) != EXACTFL) {
3289
3290 /* And set the other member of the fold pair, but
3291 * can't do that in locale because not known until
3292 * run-time */
3293 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3294 PL_fold_latin1[uc]);
970c8436
KW
3295 }
3296 }
653099ff
GS
3297 data->start_class->flags &= ~ANYOF_EOS;
3298 }
8aa23a47 3299 cl_and(data->start_class, and_withp);
653099ff 3300 }
8aa23a47
YO
3301 flags &= ~SCF_DO_STCLASS;
3302 }
e52fc539 3303 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3304 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3305 I32 f = flags, pos_before = 0;
3306 regnode * const oscan = scan;
3307 struct regnode_charclass_class this_class;
3308 struct regnode_charclass_class *oclass = NULL;
3309 I32 next_is_eval = 0;
3310
3311 switch (PL_regkind[OP(scan)]) {
3312 case WHILEM: /* End of (?:...)* . */
3313 scan = NEXTOPER(scan);
3314 goto finish;
3315 case PLUS:
3316 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3317 next = NEXTOPER(scan);
3318 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3319 mincount = 1;
3320 maxcount = REG_INFTY;
3321 next = regnext(scan);
3322 scan = NEXTOPER(scan);
3323 goto do_curly;
3324 }
3325 }
3326 if (flags & SCF_DO_SUBSTR)
3327 data->pos_min++;
3328 min++;
3329 /* Fall through. */
3330 case STAR:
3331 if (flags & SCF_DO_STCLASS) {
3332 mincount = 0;
3333 maxcount = REG_INFTY;
3334 next = regnext(scan);
3335 scan = NEXTOPER(scan);
3336 goto do_curly;
3337 }
3338 is_inf = is_inf_internal = 1;
3339 scan = regnext(scan);
c277df42 3340 if (flags & SCF_DO_SUBSTR) {
304ee84b 3341 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3342 data->longest = &(data->longest_float);
c277df42 3343 }
8aa23a47
YO
3344 goto optimize_curly_tail;
3345 case CURLY:
3346 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3347 && (scan->flags == stopparen))
3348 {
3349 mincount = 1;
3350 maxcount = 1;
3351 } else {
3352 mincount = ARG1(scan);
3353 maxcount = ARG2(scan);
653099ff 3354 }
8aa23a47
YO
3355 next = regnext(scan);
3356 if (OP(scan) == CURLYX) {
3357 I32 lp = (data ? *(data->last_closep) : 0);
3358 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3359 }
8aa23a47
YO
3360 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3361 next_is_eval = (OP(scan) == EVAL);
3362 do_curly:
3363 if (flags & SCF_DO_SUBSTR) {
304ee84b 3364 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3365 pos_before = data->pos_min;
b45f050a 3366 }
8aa23a47
YO
3367 if (data) {
3368 fl = data->flags;
3369 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3370 if (is_inf)
3371 data->flags |= SF_IS_INF;
3372 }
3373 if (flags & SCF_DO_STCLASS) {
e755fd73 3374 cl_init(pRExC_state, &this_class);
8aa23a47
YO
3375 oclass = data->start_class;
3376 data->start_class = &this_class;
3377 f |= SCF_DO_STCLASS_AND;
3378 f &= ~SCF_DO_STCLASS_OR;
3379 }
779bcb7d
NC
3380 /* Exclude from super-linear cache processing any {n,m}
3381 regops for which the combination of input pos and regex
3382 pos is not enough information to determine if a match
3383 will be possible.
3384
3385 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3386 regex pos at the \s*, the prospects for a match depend not
3387 only on the input position but also on how many (bar\s*)
3388 repeats into the {4,8} we are. */
3389 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3390 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3391
8aa23a47
YO
3392 /* This will finish on WHILEM, setting scan, or on NULL: */
3393 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3394 last, data, stopparen, recursed, NULL,
3395 (mincount == 0
3396 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3397
8aa23a47
YO
3398 if (flags & SCF_DO_STCLASS)
3399 data->start_class = oclass;
3400 if (mincount == 0 || minnext == 0) {
3401 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3402 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3403 }
3404 else if (flags & SCF_DO_STCLASS_AND) {
3405 /* Switch to OR mode: cache the old value of
3406 * data->start_class */
3407 INIT_AND_WITHP;
3408 StructCopy(data->start_class, and_withp,
3409 struct regnode_charclass_class);
3410 flags &= ~SCF_DO_STCLASS_AND;
3411 StructCopy(&this_class, data->start_class,
3412 struct regnode_charclass_class);
3413 flags |= SCF_DO_STCLASS_OR;
3414 data->start_class->flags |= ANYOF_EOS;
3415 }
3416 } else { /* Non-zero len */
3417 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3418 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3419 cl_and(data->start_class, and_withp);
3420 }
3421 else if (flags & SCF_DO_STCLASS_AND)
3422 cl_and(data->start_class, &this_class);
3423 flags &= ~SCF_DO_STCLASS;
3424 }
3425 if (!scan) /* It was not CURLYX, but CURLY. */
3426 scan = next;
3427 if ( /* ? quantifier ok, except for (?{ ... }) */
3428 (next_is_eval || !(mincount == 0 && maxcount == 1))
3429 && (minnext == 0) && (deltanext == 0)
3430 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3431 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3432 {
668c081a
NC
3433 ckWARNreg(RExC_parse,
3434 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3435 }
3436
3437 min += minnext * mincount;
3438 is_inf_internal |= ((maxcount == REG_INFTY
3439 && (minnext + deltanext) > 0)
3440 || deltanext == I32_MAX);
3441 is_inf |= is_inf_internal;
3442 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3443
3444 /* Try powerful optimization CURLYX => CURLYN. */
3445 if ( OP(oscan) == CURLYX && data
3446 && data->flags & SF_IN_PAR
3447 && !(data->flags & SF_HAS_EVAL)
3448 && !deltanext && minnext == 1 ) {
3449 /* Try to optimize to CURLYN. */
3450 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3451 regnode * const nxt1 = nxt;
497b47a8 3452#ifdef DEBUGGING
8aa23a47 3453 regnode *nxt2;
497b47a8 3454#endif
c277df42 3455
8aa23a47
YO
3456 /* Skip open. */
3457 nxt = regnext(nxt);
e52fc539 3458 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3459 && !(PL_regkind[OP(nxt)] == EXACT
3460 && STR_LEN(nxt) == 1))
3461 goto nogo;
497b47a8 3462#ifdef DEBUGGING
8aa23a47 3463 nxt2 = nxt;
497b47a8 3464#endif
8aa23a47
YO
3465 nxt = regnext(nxt);
3466 if (OP(nxt) != CLOSE)
3467 goto nogo;
3468 if (RExC_open_parens) {
3469 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3470 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3471 }
3472 /* Now we know that nxt2 is the only contents: */
3473 oscan->flags = (U8)ARG(nxt);
3474 OP(oscan) = CURLYN;
3475 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3476
c277df42 3477#ifdef DEBUGGING
8aa23a47 3478 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3479 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3480 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3481 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3482 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3483 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3484#endif
8aa23a47
YO
3485 }
3486 nogo:
3487
3488 /* Try optimization CURLYX => CURLYM. */
3489 if ( OP(oscan) == CURLYX && data
3490 && !(data->flags & SF_HAS_PAR)
3491 && !(data->flags & SF_HAS_EVAL)
3492 && !deltanext /* atom is fixed width */
3493 && minnext != 0 /* CURLYM can't handle zero width */
3494 ) {
3495 /* XXXX How to optimize if data == 0? */
3496 /* Optimize to a simpler form. */
3497 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3498 regnode *nxt2;
3499
3500 OP(oscan) = CURLYM;
3501 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3502 && (OP(nxt2) != WHILEM))
3503 nxt = nxt2;
3504 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3505 /* Need to optimize away parenths. */
b3c0965f 3506 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3507 /* Set the parenth number. */
3508 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3509
8aa23a47
YO
3510 oscan->flags = (U8)ARG(nxt);
3511 if (RExC_open_parens) {
3512 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3513 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3514 }
8aa23a47
YO
3515 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3516 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3517
c277df42 3518#ifdef DEBUGGING
8aa23a47
YO
3519 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3520 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3521 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3522 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3523#endif
c277df42 3524#if 0
8aa23a47
YO
3525 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3526 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3527 if (nnxt == nxt) {
3528 if (reg_off_by_arg[OP(nxt1)])
3529 ARG_SET(nxt1, nxt2 - nxt1);
3530 else if (nxt2 - nxt1 < U16_MAX)
3531 NEXT_OFF(nxt1) = nxt2 - nxt1;
3532 else
3533 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3534 }
8aa23a47 3535 nxt1 = nnxt;
c277df42 3536 }
5d1c421c 3537#endif
8aa23a47
YO
3538 /* Optimize again: */
3539 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3540 NULL, stopparen, recursed, NULL, 0,depth+1);
3541 }
3542 else
3543 oscan->flags = 0;
3544 }
3545 else if ((OP(oscan) == CURLYX)
3546 && (flags & SCF_WHILEM_VISITED_POS)
3547 /* See the comment on a similar expression above.
3b753521 3548 However, this time it's not a subexpression
8aa23a47
YO
3549 we care about, but the expression itself. */
3550 && (maxcount == REG_INFTY)
3551 && data && ++data->whilem_c < 16) {
3552 /* This stays as CURLYX, we can put the count/of pair. */
3553 /* Find WHILEM (as in regexec.c) */
3554 regnode *nxt = oscan + NEXT_OFF(oscan);
3555
3556 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3557 nxt += ARG(nxt);
3558 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3559 | (RExC_whilem_seen << 4)); /* On WHILEM */
3560 }
3561 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3562 pars++;
3563 if (flags & SCF_DO_SUBSTR) {
3564 SV *last_str = NULL;
3565 int counted = mincount != 0;
a0ed51b3 3566
8aa23a47
YO
3567 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3568#if defined(SPARC64_GCC_WORKAROUND)
3569 I32 b = 0;
3570 STRLEN l = 0;
3571 const char *s = NULL;
3572 I32 old = 0;
b515a41d 3573
8aa23a47
YO
3574 if (pos_before >= data->last_start_min)
3575 b = pos_before;
3576 else
3577 b = data->last_start_min;
b515a41d 3578
8aa23a47
YO
3579 l = 0;
3580 s = SvPV_const(data->last_found, l);
3581 old = b - data->last_start_min;
3582
3583#else
3584 I32 b = pos_before >= data->last_start_min
3585 ? pos_before : data->last_start_min;
3586 STRLEN l;
3587 const char * const s = SvPV_const(data->last_found, l);
3588 I32 old = b - data->last_start_min;
3589#endif
3590
3591 if (UTF)
3592 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3593 l -= old;
3594 /* Get the added string: */
740cce10 3595 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3596 if (deltanext == 0 && pos_before == b) {
3597 /* What was added is a constant string */
3598 if (mincount > 1) {
3599 SvGROW(last_str, (mincount * l) + 1);
3600 repeatcpy(SvPVX(last_str) + l,
3601 SvPVX_const(last_str), l, mincount - 1);
3602 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3603 /* Add additional parts. */
3604 SvCUR_set(data->last_found,
3605 SvCUR(data->last_found) - l);
3606 sv_catsv(data->last_found, last_str);
3607 {
3608 SV * sv = data->last_found;
3609 MAGIC *mg =
3610 SvUTF8(sv) && SvMAGICAL(sv) ?
3611 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3612 if (mg && mg->mg_len >= 0)
bd94e887 3613 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3614 }
8aa23a47 3615 data->last_end += l * (mincount - 1);
b515a41d 3616 }
8aa23a47
YO
3617 } else {
3618 /* start offset must point into the last copy */
3619 data->last_start_min += minnext * (mincount - 1);
3620 data->last_start_max += is_inf ? I32_MAX
3621 : (maxcount - 1) * (minnext + data->pos_delta);
3622 }
c277df42 3623 }
8aa23a47
YO
3624 /* It is counted once already... */
3625 data->pos_min += minnext * (mincount - counted);
3626 data->pos_delta += - counted * deltanext +
3627 (minnext + deltanext) * maxcount - minnext * mincount;
3628 if (mincount != maxcount) {
3629 /* Cannot extend fixed substrings found inside
3630 the group. */
304ee84b 3631 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3632 if (mincount && last_str) {
3633 SV * const sv = data->last_found;
3634 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3635 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3636
3637 if (mg)
3638 mg->mg_len = -1;
3639 sv_setsv(sv, last_str);
3640 data->last_end = data->pos_min;
3641 data->last_start_min =
3642 data->pos_min - CHR_SVLEN(last_str);
3643 data->last_start_max = is_inf
3644 ? I32_MAX
3645 : data->pos_min + data->pos_delta
3646 - CHR_SVLEN(last_str);
3647 }
3648 data->longest = &(data->longest_float);
3649 }
3650 SvREFCNT_dec(last_str);
c277df42 3651 }
8aa23a47
YO
3652 if (data && (fl & SF_HAS_EVAL))
3653 data->flags |= SF_HAS_EVAL;
3654 optimize_curly_tail:
3655 if (OP(oscan) != CURLYX) {
3656 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3657 && NEXT_OFF(next))
3658 NEXT_OFF(oscan) += NEXT_OFF(next);
3659 }
3660 continue;
f56b6394 3661 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 3662 if (flags & SCF_DO_SUBSTR) {
304ee84b 3663 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3664 data->longest = &(data->longest_float);
3665 }
3666 is_inf = is_inf_internal = 1;
3667 if (flags & SCF_DO_STCLASS_OR)
3fffb88a 3668 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3669 flags &= ~SCF_DO_STCLASS;
3670 break;
c277df42 3671 }
8aa23a47 3672 }
e1d1eefb
YO
3673 else if (OP(scan) == LNBREAK) {
3674 if (flags & SCF_DO_STCLASS) {
3675 int value = 0;
3676 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3677 if (flags & SCF_DO_STCLASS_AND) {