This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
charset.t: Skip locale tests depending on platform
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
e40e74fe
KW
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
81714fb9 140 HV *paren_names; /* Paren names */
1f1031fe 141
40d049e4
YO
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
b57e4118 144 I32 in_lookbehind;
4624b182 145 I32 contains_locale;
830247a4
IZ
146#if ADD_TO_REGEXEC
147 char *starttry; /* -Dr: where regtry was called. */
148#define RExC_starttry (pRExC_state->starttry)
149#endif
3dab1dad 150#ifdef DEBUGGING
be8e71aa 151 const char *lastparse;
3dab1dad 152 I32 lastnum;
1f1031fe 153 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
154#define RExC_lastparse (pRExC_state->lastparse)
155#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 156#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 157#endif
830247a4
IZ
158} RExC_state_t;
159
e2509266 160#define RExC_flags (pRExC_state->flags)
830247a4 161#define RExC_precomp (pRExC_state->precomp)
288b8c02 162#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 163#define RExC_rx (pRExC_state->rx)
f8fc2ecf 164#define RExC_rxi (pRExC_state->rxi)
fac92740 165#define RExC_start (pRExC_state->start)
830247a4
IZ
166#define RExC_end (pRExC_state->end)
167#define RExC_parse (pRExC_state->parse)
168#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
169#ifdef RE_TRACK_PATTERN_OFFSETS
170#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
171#endif
830247a4 172#define RExC_emit (pRExC_state->emit)
fac92740 173#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 174#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
175#define RExC_naughty (pRExC_state->naughty)
176#define RExC_sawback (pRExC_state->sawback)
177#define RExC_seen (pRExC_state->seen)
178#define RExC_size (pRExC_state->size)
179#define RExC_npar (pRExC_state->npar)
e2e6a0f1 180#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
181#define RExC_extralen (pRExC_state->extralen)
182#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
183#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 184#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 185#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 186#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
187#define RExC_open_parens (pRExC_state->open_parens)
188#define RExC_close_parens (pRExC_state->close_parens)
189#define RExC_opend (pRExC_state->opend)
81714fb9 190#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
191#define RExC_recurse (pRExC_state->recurse)
192#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 193#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 194#define RExC_contains_locale (pRExC_state->contains_locale)
830247a4 195
cde0cee5 196
a687059c
LW
197#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
198#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
199 ((*s) == '{' && regcurly(s)))
a687059c 200
35c8bce7
LW
201#ifdef SPSTART
202#undef SPSTART /* dratted cpp namespace... */
203#endif
a687059c
LW
204/*
205 * Flags to be passed up and down.
206 */
a687059c 207#define WORST 0 /* Worst case. */
a3b492c3 208#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
209
210/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
d7b56a3c 211 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
fda99bee 212#define SIMPLE 0x02
a3b492c3
YO
213#define SPSTART 0x04 /* Starts with * or +. */
214#define TRYAGAIN 0x08 /* Weeded out a declaration. */
215#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 216
3dab1dad
YO
217#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
218
07be1b83
YO
219/* whether trie related optimizations are enabled */
220#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
221#define TRIE_STUDY_OPT
786e8c11 222#define FULL_TRIE_STUDY
07be1b83
YO
223#define TRIE_STCLASS
224#endif
1de06328
YO
225
226
40d049e4
YO
227
228#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
229#define PBITVAL(paren) (1 << ((paren) & 7))
230#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
231#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
232#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
233
bbd61b5f
KW
234/* If not already in utf8, do a longjmp back to the beginning */
235#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
236#define REQUIRE_UTF8 STMT_START { \
237 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
238 } STMT_END
40d049e4 239
1de06328
YO
240/* About scan_data_t.
241
242 During optimisation we recurse through the regexp program performing
243 various inplace (keyhole style) optimisations. In addition study_chunk
244 and scan_commit populate this data structure with information about
245 what strings MUST appear in the pattern. We look for the longest
3b753521 246 string that must appear at a fixed location, and we look for the
1de06328
YO
247 longest string that may appear at a floating location. So for instance
248 in the pattern:
249
250 /FOO[xX]A.*B[xX]BAR/
251
252 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
253 strings (because they follow a .* construct). study_chunk will identify
254 both FOO and BAR as being the longest fixed and floating strings respectively.
255
256 The strings can be composites, for instance
257
258 /(f)(o)(o)/
259
260 will result in a composite fixed substring 'foo'.
261
262 For each string some basic information is maintained:
263
264 - offset or min_offset
265 This is the position the string must appear at, or not before.
266 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
267 characters must match before the string we are searching for.
268 Likewise when combined with minlenp and the length of the string it
1de06328
YO
269 tells us how many characters must appear after the string we have
270 found.
271
272 - max_offset
273 Only used for floating strings. This is the rightmost point that
3b753521 274 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
275 string can occur infinitely far to the right.
276
277 - minlenp
278 A pointer to the minimum length of the pattern that the string
279 was found inside. This is important as in the case of positive
280 lookahead or positive lookbehind we can have multiple patterns
281 involved. Consider
282
283 /(?=FOO).*F/
284
285 The minimum length of the pattern overall is 3, the minimum length
286 of the lookahead part is 3, but the minimum length of the part that
287 will actually match is 1. So 'FOO's minimum length is 3, but the
288 minimum length for the F is 1. This is important as the minimum length
289 is used to determine offsets in front of and behind the string being
290 looked for. Since strings can be composites this is the length of the
486ec47a 291 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
292 the length is calculated by study_chunk, so that the minimum lengths
293 are not known until the full pattern has been compiled, thus the
294 pointer to the value.
295
296 - lookbehind
297
298 In the case of lookbehind the string being searched for can be
299 offset past the start point of the final matching string.
300 If this value was just blithely removed from the min_offset it would
301 invalidate some of the calculations for how many chars must match
302 before or after (as they are derived from min_offset and minlen and
303 the length of the string being searched for).
304 When the final pattern is compiled and the data is moved from the
305 scan_data_t structure into the regexp structure the information
306 about lookbehind is factored in, with the information that would
307 have been lost precalculated in the end_shift field for the
308 associated string.
309
310 The fields pos_min and pos_delta are used to store the minimum offset
311 and the delta to the maximum offset at the current point in the pattern.
312
313*/
2c2d71f5
JH
314
315typedef struct scan_data_t {
1de06328
YO
316 /*I32 len_min; unused */
317 /*I32 len_delta; unused */
2c2d71f5
JH
318 I32 pos_min;
319 I32 pos_delta;
320 SV *last_found;
1de06328 321 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
322 I32 last_start_min;
323 I32 last_start_max;
1de06328
YO
324 SV **longest; /* Either &l_fixed, or &l_float. */
325 SV *longest_fixed; /* longest fixed string found in pattern */
326 I32 offset_fixed; /* offset where it starts */
486ec47a 327 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
328 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
329 SV *longest_float; /* longest floating string found in pattern */
330 I32 offset_float_min; /* earliest point in string it can appear */
331 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 332 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 333 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
334 I32 flags;
335 I32 whilem_c;
cb434fcc 336 I32 *last_closep;
653099ff 337 struct regnode_charclass_class *start_class;
2c2d71f5
JH
338} scan_data_t;
339
a687059c 340/*
e50aee73 341 * Forward declarations for pregcomp()'s friends.
a687059c 342 */
a0d0e21e 343
27da23d5 344static const scan_data_t zero_scan_data =
1de06328 345 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
346
347#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
348#define SF_BEFORE_SEOL 0x0001
349#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
350#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
351#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
352
09b7f37c
CB
353#ifdef NO_UNARY_PLUS
354# define SF_FIX_SHIFT_EOL (0+2)
355# define SF_FL_SHIFT_EOL (0+4)
356#else
357# define SF_FIX_SHIFT_EOL (+2)
358# define SF_FL_SHIFT_EOL (+4)
359#endif
c277df42
IZ
360
361#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
362#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
363
364#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
365#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
366#define SF_IS_INF 0x0040
367#define SF_HAS_PAR 0x0080
368#define SF_IN_PAR 0x0100
369#define SF_HAS_EVAL 0x0200
370#define SCF_DO_SUBSTR 0x0400
653099ff
GS
371#define SCF_DO_STCLASS_AND 0x0800
372#define SCF_DO_STCLASS_OR 0x1000
373#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 374#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 375
786e8c11 376#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 377#define SCF_SEEN_ACCEPT 0x8000
07be1b83 378
43fead97 379#define UTF cBOOL(RExC_utf8)
a62b1201
KW
380#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
381#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
382#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
383#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
384#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1
KW
385#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
386#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 387
43fead97 388#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 389
ffc61ed2 390#define OOB_UNICODE 12345678
93733859 391#define OOB_NAMEDCLASS -1
b8c5462f 392
a0ed51b3
LW
393#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
394#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
395
8615cb43 396
b45f050a
JF
397/* length of regex to show in messages that don't mark a position within */
398#define RegexLengthToShowInErrorMessages 127
399
400/*
401 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
402 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
403 * op/pragma/warn/regcomp.
404 */
7253e4e3
RK
405#define MARKER1 "<-- HERE" /* marker as it appears in the description */
406#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 407
7253e4e3 408#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
409
410/*
411 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
412 * arg. Show regex, up to a maximum length. If it's too long, chop and add
413 * "...".
414 */
58e23c8d 415#define _FAIL(code) STMT_START { \
bfed75c6 416 const char *ellipses = ""; \
ccb2c380
MP
417 IV len = RExC_end - RExC_precomp; \
418 \
419 if (!SIZE_ONLY) \
288b8c02 420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
421 if (len > RegexLengthToShowInErrorMessages) { \
422 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
423 len = RegexLengthToShowInErrorMessages - 10; \
424 ellipses = "..."; \
425 } \
58e23c8d 426 code; \
ccb2c380 427} STMT_END
8615cb43 428
58e23c8d
YO
429#define FAIL(msg) _FAIL( \
430 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
431 msg, (int)len, RExC_precomp, ellipses))
432
433#define FAIL2(msg,arg) _FAIL( \
434 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
435 arg, (int)len, RExC_precomp, ellipses))
436
b45f050a 437/*
b45f050a
JF
438 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
439 */
ccb2c380 440#define Simple_vFAIL(m) STMT_START { \
a28509cc 441 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
442 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
443 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
444} STMT_END
b45f050a
JF
445
446/*
447 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
448 */
ccb2c380
MP
449#define vFAIL(m) STMT_START { \
450 if (!SIZE_ONLY) \
288b8c02 451 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
452 Simple_vFAIL(m); \
453} STMT_END
b45f050a
JF
454
455/*
456 * Like Simple_vFAIL(), but accepts two arguments.
457 */
ccb2c380 458#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 459 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
460 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
461 (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
b45f050a
JF
463
464/*
465 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
466 */
ccb2c380
MP
467#define vFAIL2(m,a1) STMT_START { \
468 if (!SIZE_ONLY) \
288b8c02 469 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
470 Simple_vFAIL2(m, a1); \
471} STMT_END
b45f050a
JF
472
473
474/*
475 * Like Simple_vFAIL(), but accepts three arguments.
476 */
ccb2c380 477#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 478 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
479 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
480 (int)offset, RExC_precomp, RExC_precomp + offset); \
481} STMT_END
b45f050a
JF
482
483/*
484 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
485 */
ccb2c380
MP
486#define vFAIL3(m,a1,a2) STMT_START { \
487 if (!SIZE_ONLY) \
288b8c02 488 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
489 Simple_vFAIL3(m, a1, a2); \
490} STMT_END
b45f050a
JF
491
492/*
493 * Like Simple_vFAIL(), but accepts four arguments.
494 */
ccb2c380 495#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 496 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
497 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
498 (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
b45f050a 500
668c081a 501#define ckWARNreg(loc,m) STMT_START { \
a28509cc 502 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
505} STMT_END
506
668c081a 507#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 508 const IV offset = loc - RExC_precomp; \
d1d15184 509 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
510 m REPORT_LOCATION, \
511 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
512} STMT_END
513
2335b3d3
KW
514#define ckWARN2regdep(loc,m, a1) STMT_START { \
515 const IV offset = loc - RExC_precomp; \
516 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
517 m REPORT_LOCATION, \
518 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
519} STMT_END
520
668c081a 521#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 522 const IV offset = loc - RExC_precomp; \
668c081a 523 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
524 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
525} STMT_END
526
527#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 528 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
529 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
530 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
531} STMT_END
532
668c081a
NC
533#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
534 const IV offset = loc - RExC_precomp; \
535 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
536 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
537} STMT_END
538
ccb2c380 539#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 540 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
541 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
542 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
543} STMT_END
544
668c081a
NC
545#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
546 const IV offset = loc - RExC_precomp; \
547 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
548 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
549} STMT_END
550
ccb2c380 551#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 552 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
553 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
554 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
555} STMT_END
9d1d55b5 556
8615cb43 557
cd439c50 558/* Allow for side effects in s */
ccb2c380
MP
559#define REGC(c,s) STMT_START { \
560 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
561} STMT_END
cd439c50 562
fac92740
MJD
563/* Macros for recording node offsets. 20001227 mjd@plover.com
564 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
565 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
566 * Element 0 holds the number n.
07be1b83 567 * Position is 1 indexed.
fac92740 568 */
7122b237
YO
569#ifndef RE_TRACK_PATTERN_OFFSETS
570#define Set_Node_Offset_To_R(node,byte)
571#define Set_Node_Offset(node,byte)
572#define Set_Cur_Node_Offset
573#define Set_Node_Length_To_R(node,len)
574#define Set_Node_Length(node,len)
575#define Set_Node_Cur_Length(node)
576#define Node_Offset(n)
577#define Node_Length(n)
578#define Set_Node_Offset_Length(node,offset,len)
579#define ProgLen(ri) ri->u.proglen
580#define SetProgLen(ri,x) ri->u.proglen = x
581#else
582#define ProgLen(ri) ri->u.offsets[0]
583#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
584#define Set_Node_Offset_To_R(node,byte) STMT_START { \
585 if (! SIZE_ONLY) { \
586 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 587 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 588 if((node) < 0) { \
551405c4 589 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
590 } else { \
591 RExC_offsets[2*(node)-1] = (byte); \
592 } \
593 } \
594} STMT_END
595
596#define Set_Node_Offset(node,byte) \
597 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
598#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
599
600#define Set_Node_Length_To_R(node,len) STMT_START { \
601 if (! SIZE_ONLY) { \
602 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 603 __LINE__, (int)(node), (int)(len))); \
ccb2c380 604 if((node) < 0) { \
551405c4 605 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
606 } else { \
607 RExC_offsets[2*(node)] = (len); \
608 } \
609 } \
610} STMT_END
611
612#define Set_Node_Length(node,len) \
613 Set_Node_Length_To_R((node)-RExC_emit_start, len)
614#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
615#define Set_Node_Cur_Length(node) \
616 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
617
618/* Get offsets and lengths */
619#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
620#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
621
07be1b83
YO
622#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
623 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
624 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
625} STMT_END
7122b237 626#endif
07be1b83
YO
627
628#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
629#define EXPERIMENTAL_INPLACESCAN
f427392e 630#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 631
304ee84b
YO
632#define DEBUG_STUDYDATA(str,data,depth) \
633DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 634 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
635 "%*s" str "Pos:%"IVdf"/%"IVdf \
636 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
637 (int)(depth)*2, "", \
638 (IV)((data)->pos_min), \
639 (IV)((data)->pos_delta), \
304ee84b 640 (UV)((data)->flags), \
1de06328 641 (IV)((data)->whilem_c), \
304ee84b
YO
642 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
643 is_inf ? "INF " : "" \
1de06328
YO
644 ); \
645 if ((data)->last_found) \
646 PerlIO_printf(Perl_debug_log, \
647 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
648 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
649 SvPVX_const((data)->last_found), \
650 (IV)((data)->last_end), \
651 (IV)((data)->last_start_min), \
652 (IV)((data)->last_start_max), \
653 ((data)->longest && \
654 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
655 SvPVX_const((data)->longest_fixed), \
656 (IV)((data)->offset_fixed), \
657 ((data)->longest && \
658 (data)->longest==&((data)->longest_float)) ? "*" : "", \
659 SvPVX_const((data)->longest_float), \
660 (IV)((data)->offset_float_min), \
661 (IV)((data)->offset_float_max) \
662 ); \
663 PerlIO_printf(Perl_debug_log,"\n"); \
664});
665
acfe0abc 666static void clear_re(pTHX_ void *r);
4327152a 667
653099ff 668/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 669 Update the longest found anchored substring and the longest found
653099ff
GS
670 floating substrings if needed. */
671
4327152a 672STATIC void
304ee84b 673S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 674{
e1ec3a88
AL
675 const STRLEN l = CHR_SVLEN(data->last_found);
676 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 677 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 678
7918f24d
NC
679 PERL_ARGS_ASSERT_SCAN_COMMIT;
680
c277df42 681 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 682 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
683 if (*data->longest == data->longest_fixed) {
684 data->offset_fixed = l ? data->last_start_min : data->pos_min;
685 if (data->flags & SF_BEFORE_EOL)
b81d288d 686 data->flags
c277df42
IZ
687 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
688 else
689 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
690 data->minlen_fixed=minlenp;
691 data->lookbehind_fixed=0;
a0ed51b3 692 }
304ee84b 693 else { /* *data->longest == data->longest_float */
c277df42 694 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
695 data->offset_float_max = (l
696 ? data->last_start_max
c277df42 697 : data->pos_min + data->pos_delta);
304ee84b 698 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 699 data->offset_float_max = I32_MAX;
c277df42 700 if (data->flags & SF_BEFORE_EOL)
b81d288d 701 data->flags
c277df42
IZ
702 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
703 else
704 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
705 data->minlen_float=minlenp;
706 data->lookbehind_float=0;
c277df42
IZ
707 }
708 }
709 SvCUR_set(data->last_found, 0);
0eda9292 710 {
a28509cc 711 SV * const sv = data->last_found;
097eb12c
AL
712 if (SvUTF8(sv) && SvMAGICAL(sv)) {
713 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
714 if (mg)
715 mg->mg_len = 0;
716 }
0eda9292 717 }
c277df42
IZ
718 data->last_end = -1;
719 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 720 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
721}
722
653099ff
GS
723/* Can match anything (initialization) */
724STATIC void
3fffb88a 725S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 726{
7918f24d
NC
727 PERL_ARGS_ASSERT_CL_ANYTHING;
728
f8bef550 729 ANYOF_BITMAP_SETALL(cl);
6b1ea9dd 730 ANYOF_CLASS_ZERO(cl); /* all bits set, so class is irrelevant */
3fffb88a
KW
731 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
732
733 /* If any portion of the regex is to operate under locale rules,
734 * initialization includes it. The reason this isn't done for all regexes
735 * is that the optimizer was written under the assumption that locale was
736 * all-or-nothing. Given the complexity and lack of documentation in the
737 * optimizer, and that there are inadequate test cases for locale, so many
738 * parts of it may not work properly, it is safest to avoid locale unless
739 * necessary. */
740 if (RExC_contains_locale) {
741 cl->flags |= ANYOF_LOCALE;
742 }
653099ff
GS
743}
744
745/* Can match anything (initialization) */
746STATIC int
5f66b61c 747S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
748{
749 int value;
750
7918f24d
NC
751 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
752
aaa51d5e 753 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
754 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
755 return 1;
1aa99e6b
IH
756 if (!(cl->flags & ANYOF_UNICODE_ALL))
757 return 0;
10edeb5d 758 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 759 return 0;
653099ff
GS
760 return 1;
761}
762
763/* Can match anything (initialization) */
764STATIC void
e755fd73 765S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 766{
7918f24d
NC
767 PERL_ARGS_ASSERT_CL_INIT;
768
8ecf7187 769 Zero(cl, 1, struct regnode_charclass_class);
653099ff 770 cl->type = ANYOF;
3fffb88a 771 cl_anything(pRExC_state, cl);
1411dba4 772 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
653099ff
GS
773}
774
1051e1c4
KW
775/* These two functions currently do the exact same thing */
776#define cl_init_zero S_cl_init
653099ff
GS
777
778/* 'And' a given class with another one. Can create false positives */
58b5ba03 779/* cl should not be inverted */
653099ff 780STATIC void
5f66b61c 781S_cl_and(struct regnode_charclass_class *cl,
a28509cc 782 const struct regnode_charclass_class *and_with)
653099ff 783{
7918f24d 784 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
785
786 assert(and_with->type == ANYOF);
1e6ade67 787
c6b76537 788 /* I (khw) am not sure all these restrictions are necessary XXX */
1e6ade67
KW
789 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
790 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 791 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
792 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
793 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
794 int i;
795
796 if (and_with->flags & ANYOF_INVERT)
797 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
798 cl->bitmap[i] &= ~and_with->bitmap[i];
799 else
800 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
801 cl->bitmap[i] &= and_with->bitmap[i];
802 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
1aa99e6b 803
c6b76537 804 if (and_with->flags & ANYOF_INVERT) {
8951c461 805
c6b76537
KW
806 /* Here, the and'ed node is inverted. Get the AND of the flags that
807 * aren't affected by the inversion. Those that are affected are
808 * handled individually below */
809 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
810 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
811 cl->flags |= affected_flags;
812
813 /* We currently don't know how to deal with things that aren't in the
814 * bitmap, but we know that the intersection is no greater than what
815 * is already in cl, so let there be false positives that get sorted
816 * out after the synthetic start class succeeds, and the node is
817 * matched for real. */
818
819 /* The inversion of these two flags indicate that the resulting
820 * intersection doesn't have them */
821 if (and_with->flags & ANYOF_UNICODE_ALL) {
4713bfe1
KW
822 cl->flags &= ~ANYOF_UNICODE_ALL;
823 }
c6b76537
KW
824 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
825 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
137165a6 826 }
1aa99e6b 827 }
c6b76537 828 else { /* and'd node is not inverted */
137165a6 829 if (! ANYOF_NONBITMAP(and_with)) {
c6b76537
KW
830
831 /* Here 'and_with' doesn't match anything outside the bitmap
832 * (except possibly ANYOF_UNICODE_ALL), which means the
833 * intersection can't either, except for ANYOF_UNICODE_ALL, in
834 * which case we don't know what the intersection is, but it's no
835 * greater than what cl already has, so can just leave it alone,
836 * with possible false positives */
837 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
838 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
871d0d1a 839 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
c6b76537 840 }
137165a6 841 }
c6b76537
KW
842 else if (! ANYOF_NONBITMAP(cl)) {
843
844 /* Here, 'and_with' does match something outside the bitmap, and cl
845 * doesn't have a list of things to match outside the bitmap. If
846 * cl can match all code points above 255, the intersection will
847 * be those above-255 code points that 'and_with' matches. There
848 * may be false positives from code points in 'and_with' that are
849 * outside the bitmap but below 256, but those get sorted out
850 * after the synthetic start class succeeds). If cl can't match
851 * all Unicode code points, it means here that it can't match *
852 * anything outside the bitmap, so we leave the bitmap empty */
853 if (cl->flags & ANYOF_UNICODE_ALL) {
854 ARG_SET(cl, ARG(and_with));
855 }
856 }
857 else {
858 /* Here, both 'and_with' and cl match something outside the
859 * bitmap. Currently we do not do the intersection, so just match
860 * whatever cl had at the beginning. */
861 }
862
863
864 /* Take the intersection of the two sets of flags */
865 cl->flags &= and_with->flags;
137165a6 866 }
653099ff
GS
867}
868
869/* 'OR' a given class with another one. Can create false positives */
58b5ba03 870/* cl should not be inverted */
653099ff 871STATIC void
3fffb88a 872S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 873{
7918f24d
NC
874 PERL_ARGS_ASSERT_CL_OR;
875
653099ff 876 if (or_with->flags & ANYOF_INVERT) {
c6b76537
KW
877
878 /* Here, the or'd node is to be inverted. This means we take the
879 * complement of everything not in the bitmap, but currently we don't
880 * know what that is, so give up and match anything */
881 if (ANYOF_NONBITMAP(or_with)) {
3fffb88a 882 cl_anything(pRExC_state, cl);
c6b76537 883 }
653099ff
GS
884 /* We do not use
885 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
886 * <= (B1 | !B2) | (CL1 | !CL2)
887 * which is wasteful if CL2 is small, but we ignore CL2:
888 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
889 * XXXX Can we handle case-fold? Unclear:
890 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
891 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
892 */
c6b76537 893 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
894 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
895 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
896 int i;
897
898 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
899 cl->bitmap[i] |= ~or_with->bitmap[i];
900 } /* XXXX: logic is complicated otherwise */
901 else {
3fffb88a 902 cl_anything(pRExC_state, cl);
653099ff 903 }
c6b76537
KW
904
905 /* And, we can just take the union of the flags that aren't affected
906 * by the inversion */
907 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
908
909 /* For the remaining flags:
910 ANYOF_UNICODE_ALL and inverted means to not match anything above
911 255, which means that the union with cl should just be
912 what cl has in it, so can ignore this flag
913 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
914 is 127-255 to match them, but then invert that, so the
915 union with cl should just be what cl has in it, so can
916 ignore this flag
917 */
918 } else { /* 'or_with' is not inverted */
653099ff
GS
919 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
920 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
921 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
922 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
923 int i;
924
925 /* OR char bitmap and class bitmap separately */
926 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
927 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 928 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
929 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
930 cl->classflags[i] |= or_with->classflags[i];
931 cl->flags |= ANYOF_CLASS;
932 }
933 }
934 else { /* XXXX: logic is complicated, leave it along for a moment. */
3fffb88a 935 cl_anything(pRExC_state, cl);
653099ff 936 }
9826f543 937
c6b76537
KW
938 /* Take the union */
939 cl->flags |= or_with->flags;
940
941 if (ANYOF_NONBITMAP(or_with)) {
942
943 /* Use the added node's outside-the-bit-map match if there isn't a
944 * conflict. If there is a conflict (both nodes match something
945 * outside the bitmap, but what they match outside is not the same
946 * pointer, and hence not easily compared until XXX we extend
947 * inversion lists this far), give up and allow the start class to
948 * match everything outside the bitmap */
949 if (! ANYOF_NONBITMAP(cl)) {
950 ARG_SET(cl, ARG(or_with));
951 }
952 else if (ARG(cl) != ARG(or_with)) {
953 cl->flags |= ANYOF_UNICODE_ALL;
954 }
955 }
1aa99e6b 956 }
653099ff
GS
957}
958
a3621e74
YO
959#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
960#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
961#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
962#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
963
3dab1dad
YO
964
965#ifdef DEBUGGING
07be1b83 966/*
2b8b4781
NC
967 dump_trie(trie,widecharmap,revcharmap)
968 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
969 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
970
971 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
972 The _interim_ variants are used for debugging the interim
973 tables that are used to generate the final compressed
974 representation which is what dump_trie expects.
975
486ec47a 976 Part of the reason for their existence is to provide a form
3dab1dad 977 of documentation as to how the different representations function.
07be1b83
YO
978
979*/
3dab1dad
YO
980
981/*
3dab1dad
YO
982 Dumps the final compressed table form of the trie to Perl_debug_log.
983 Used for debugging make_trie().
984*/
b9a59e08 985
3dab1dad 986STATIC void
2b8b4781
NC
987S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
988 AV *revcharmap, U32 depth)
3dab1dad
YO
989{
990 U32 state;
ab3bbdeb 991 SV *sv=sv_newmortal();
55eed653 992 int colwidth= widecharmap ? 6 : 4;
2e64971a 993 U16 word;
3dab1dad
YO
994 GET_RE_DEBUG_FLAGS_DECL;
995
7918f24d 996 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 997
3dab1dad
YO
998 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
999 (int)depth * 2 + 2,"",
1000 "Match","Base","Ofs" );
1001
1002 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1003 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1004 if ( tmp ) {
ab3bbdeb
YO
1005 PerlIO_printf( Perl_debug_log, "%*s",
1006 colwidth,
ddc5bc0f 1007 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1008 PL_colors[0], PL_colors[1],
1009 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1010 PERL_PV_ESCAPE_FIRSTCHAR
1011 )
1012 );
3dab1dad
YO
1013 }
1014 }
1015 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1016 (int)depth * 2 + 2,"");
1017
1018 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1019 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1020 PerlIO_printf( Perl_debug_log, "\n");
1021
1e2e3d02 1022 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1023 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1024
1025 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1026
1027 if ( trie->states[ state ].wordnum ) {
1028 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1029 } else {
1030 PerlIO_printf( Perl_debug_log, "%6s", "" );
1031 }
1032
1033 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1034
1035 if ( base ) {
1036 U32 ofs = 0;
1037
1038 while( ( base + ofs < trie->uniquecharcount ) ||
1039 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1040 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1041 ofs++;
1042
1043 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1044
1045 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1046 if ( ( base + ofs >= trie->uniquecharcount ) &&
1047 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1048 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1049 {
ab3bbdeb
YO
1050 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1051 colwidth,
3dab1dad
YO
1052 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1053 } else {
ab3bbdeb 1054 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1055 }
1056 }
1057
1058 PerlIO_printf( Perl_debug_log, "]");
1059
1060 }
1061 PerlIO_printf( Perl_debug_log, "\n" );
1062 }
2e64971a
DM
1063 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1064 for (word=1; word <= trie->wordcount; word++) {
1065 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1066 (int)word, (int)(trie->wordinfo[word].prev),
1067 (int)(trie->wordinfo[word].len));
1068 }
1069 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1070}
1071/*
3dab1dad
YO
1072 Dumps a fully constructed but uncompressed trie in list form.
1073 List tries normally only are used for construction when the number of
1074 possible chars (trie->uniquecharcount) is very high.
1075 Used for debugging make_trie().
1076*/
1077STATIC void
55eed653 1078S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1079 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1080 U32 depth)
3dab1dad
YO
1081{
1082 U32 state;
ab3bbdeb 1083 SV *sv=sv_newmortal();
55eed653 1084 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1085 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1086
1087 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1088
3dab1dad 1089 /* print out the table precompression. */
ab3bbdeb
YO
1090 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1091 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1092 "------:-----+-----------------\n" );
3dab1dad
YO
1093
1094 for( state=1 ; state < next_alloc ; state ++ ) {
1095 U16 charid;
1096
ab3bbdeb 1097 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1098 (int)depth * 2 + 2,"", (UV)state );
1099 if ( ! trie->states[ state ].wordnum ) {
1100 PerlIO_printf( Perl_debug_log, "%5s| ","");
1101 } else {
1102 PerlIO_printf( Perl_debug_log, "W%4x| ",
1103 trie->states[ state ].wordnum
1104 );
1105 }
1106 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1107 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1108 if ( tmp ) {
1109 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1110 colwidth,
ddc5bc0f 1111 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1112 PL_colors[0], PL_colors[1],
1113 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1114 PERL_PV_ESCAPE_FIRSTCHAR
1115 ) ,
1e2e3d02
YO
1116 TRIE_LIST_ITEM(state,charid).forid,
1117 (UV)TRIE_LIST_ITEM(state,charid).newstate
1118 );
1119 if (!(charid % 10))
664e119d
RGS
1120 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1121 (int)((depth * 2) + 14), "");
1e2e3d02 1122 }
ab3bbdeb
YO
1123 }
1124 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1125 }
1126}
1127
1128/*
3dab1dad
YO
1129 Dumps a fully constructed but uncompressed trie in table form.
1130 This is the normal DFA style state transition table, with a few
1131 twists to facilitate compression later.
1132 Used for debugging make_trie().
1133*/
1134STATIC void
55eed653 1135S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1136 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1137 U32 depth)
3dab1dad
YO
1138{
1139 U32 state;
1140 U16 charid;
ab3bbdeb 1141 SV *sv=sv_newmortal();
55eed653 1142 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1143 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1144
1145 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1146
1147 /*
1148 print out the table precompression so that we can do a visual check
1149 that they are identical.
1150 */
1151
1152 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1153
1154 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1155 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1156 if ( tmp ) {
ab3bbdeb
YO
1157 PerlIO_printf( Perl_debug_log, "%*s",
1158 colwidth,
ddc5bc0f 1159 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1160 PL_colors[0], PL_colors[1],
1161 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1162 PERL_PV_ESCAPE_FIRSTCHAR
1163 )
1164 );
3dab1dad
YO
1165 }
1166 }
1167
1168 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1169
1170 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1171 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1172 }
1173
1174 PerlIO_printf( Perl_debug_log, "\n" );
1175
1176 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1177
1178 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1179 (int)depth * 2 + 2,"",
1180 (UV)TRIE_NODENUM( state ) );
1181
1182 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1183 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1184 if (v)
1185 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1186 else
1187 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1188 }
1189 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1190 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1191 } else {
1192 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1193 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1194 }
1195 }
07be1b83 1196}
3dab1dad
YO
1197
1198#endif
1199
2e64971a 1200
786e8c11
YO
1201/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1202 startbranch: the first branch in the whole branch sequence
1203 first : start branch of sequence of branch-exact nodes.
1204 May be the same as startbranch
1205 last : Thing following the last branch.
1206 May be the same as tail.
1207 tail : item following the branch sequence
1208 count : words in the sequence
1209 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1210 depth : indent depth
3dab1dad 1211
786e8c11 1212Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1213
786e8c11
YO
1214A trie is an N'ary tree where the branches are determined by digital
1215decomposition of the key. IE, at the root node you look up the 1st character and
1216follow that branch repeat until you find the end of the branches. Nodes can be
1217marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1218
786e8c11 1219 /he|she|his|hers/
72f13be8 1220
786e8c11
YO
1221would convert into the following structure. Numbers represent states, letters
1222following numbers represent valid transitions on the letter from that state, if
1223the number is in square brackets it represents an accepting state, otherwise it
1224will be in parenthesis.
07be1b83 1225
786e8c11
YO
1226 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1227 | |
1228 | (2)
1229 | |
1230 (1) +-i->(6)-+-s->[7]
1231 |
1232 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1233
786e8c11
YO
1234 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1235
1236This shows that when matching against the string 'hers' we will begin at state 1
1237read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1238then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1239is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1240single traverse. We store a mapping from accepting to state to which word was
1241matched, and then when we have multiple possibilities we try to complete the
1242rest of the regex in the order in which they occured in the alternation.
1243
1244The only prior NFA like behaviour that would be changed by the TRIE support is
1245the silent ignoring of duplicate alternations which are of the form:
1246
1247 / (DUPE|DUPE) X? (?{ ... }) Y /x
1248
4b714af6 1249Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1250and without the optimisation. With the optimisations dupes will be silently
486ec47a 1251ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1252the following demonstrates:
1253
1254 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1255
1256which prints out 'word' three times, but
1257
1258 'words'=~/(word|word|word)(?{ print $1 })S/
1259
1260which doesnt print it out at all. This is due to other optimisations kicking in.
1261
1262Example of what happens on a structural level:
1263
486ec47a 1264The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1265
1266 1: CURLYM[1] {1,32767}(18)
1267 5: BRANCH(8)
1268 6: EXACT <ac>(16)
1269 8: BRANCH(11)
1270 9: EXACT <ad>(16)
1271 11: BRANCH(14)
1272 12: EXACT <ab>(16)
1273 16: SUCCEED(0)
1274 17: NOTHING(18)
1275 18: END(0)
1276
1277This would be optimizable with startbranch=5, first=5, last=16, tail=16
1278and should turn into:
1279
1280 1: CURLYM[1] {1,32767}(18)
1281 5: TRIE(16)
1282 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1283 <ac>
1284 <ad>
1285 <ab>
1286 16: SUCCEED(0)
1287 17: NOTHING(18)
1288 18: END(0)
1289
1290Cases where tail != last would be like /(?foo|bar)baz/:
1291
1292 1: BRANCH(4)
1293 2: EXACT <foo>(8)
1294 4: BRANCH(7)
1295 5: EXACT <bar>(8)
1296 7: TAIL(8)
1297 8: EXACT <baz>(10)
1298 10: END(0)
1299
1300which would be optimizable with startbranch=1, first=1, last=7, tail=8
1301and would end up looking like:
1302
1303 1: TRIE(8)
1304 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1305 <foo>
1306 <bar>
1307 7: TAIL(8)
1308 8: EXACT <baz>(10)
1309 10: END(0)
1310
1311 d = uvuni_to_utf8_flags(d, uv, 0);
1312
1313is the recommended Unicode-aware way of saying
1314
1315 *(d++) = uv;
1316*/
1317
1e2e3d02 1318#define TRIE_STORE_REVCHAR \
786e8c11 1319 STMT_START { \
73031816
NC
1320 if (UTF) { \
1321 SV *zlopp = newSV(2); \
88c9ea1e
CB
1322 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1323 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1324 SvCUR_set(zlopp, kapow - flrbbbbb); \
1325 SvPOK_on(zlopp); \
1326 SvUTF8_on(zlopp); \
1327 av_push(revcharmap, zlopp); \
1328 } else { \
6bdeddd2 1329 char ooooff = (char)uvc; \
73031816
NC
1330 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1331 } \
1332 } STMT_END
786e8c11
YO
1333
1334#define TRIE_READ_CHAR STMT_START { \
1335 wordlen++; \
1336 if ( UTF ) { \
1337 if ( folder ) { \
1338 if ( foldlen > 0 ) { \
1339 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1340 foldlen -= len; \
1341 scan += len; \
1342 len = 0; \
1343 } else { \
1344 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1345 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1346 foldlen -= UNISKIP( uvc ); \
1347 scan = foldbuf + UNISKIP( uvc ); \
1348 } \
1349 } else { \
1350 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1351 } \
1352 } else { \
1353 uvc = (U32)*uc; \
1354 len = 1; \
1355 } \
1356} STMT_END
1357
1358
1359
1360#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1361 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1362 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1363 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1364 } \
1365 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1366 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1367 TRIE_LIST_CUR( state )++; \
1368} STMT_END
07be1b83 1369
786e8c11
YO
1370#define TRIE_LIST_NEW(state) STMT_START { \
1371 Newxz( trie->states[ state ].trans.list, \
1372 4, reg_trie_trans_le ); \
1373 TRIE_LIST_CUR( state ) = 1; \
1374 TRIE_LIST_LEN( state ) = 4; \
1375} STMT_END
07be1b83 1376
786e8c11
YO
1377#define TRIE_HANDLE_WORD(state) STMT_START { \
1378 U16 dupe= trie->states[ state ].wordnum; \
1379 regnode * const noper_next = regnext( noper ); \
1380 \
786e8c11
YO
1381 DEBUG_r({ \
1382 /* store the word for dumping */ \
1383 SV* tmp; \
1384 if (OP(noper) != NOTHING) \
740cce10 1385 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1386 else \
740cce10 1387 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1388 av_push( trie_words, tmp ); \
786e8c11
YO
1389 }); \
1390 \
1391 curword++; \
2e64971a
DM
1392 trie->wordinfo[curword].prev = 0; \
1393 trie->wordinfo[curword].len = wordlen; \
1394 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1395 \
1396 if ( noper_next < tail ) { \
1397 if (!trie->jump) \
c944940b 1398 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1399 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1400 if (!jumper) \
1401 jumper = noper_next; \
1402 if (!nextbranch) \
1403 nextbranch= regnext(cur); \
1404 } \
1405 \
1406 if ( dupe ) { \
2e64971a
DM
1407 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1408 /* chain, so that when the bits of chain are later */\
1409 /* linked together, the dups appear in the chain */\
1410 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1411 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1412 } else { \
1413 /* we haven't inserted this word yet. */ \
1414 trie->states[ state ].wordnum = curword; \
1415 } \
1416} STMT_END
07be1b83 1417
3dab1dad 1418
786e8c11
YO
1419#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1420 ( ( base + charid >= ucharcount \
1421 && base + charid < ubound \
1422 && state == trie->trans[ base - ucharcount + charid ].check \
1423 && trie->trans[ base - ucharcount + charid ].next ) \
1424 ? trie->trans[ base - ucharcount + charid ].next \
1425 : ( state==1 ? special : 0 ) \
1426 )
3dab1dad 1427
786e8c11
YO
1428#define MADE_TRIE 1
1429#define MADE_JUMP_TRIE 2
1430#define MADE_EXACT_TRIE 4
3dab1dad 1431
a3621e74 1432STATIC I32
786e8c11 1433S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1434{
27da23d5 1435 dVAR;
a3621e74
YO
1436 /* first pass, loop through and scan words */
1437 reg_trie_data *trie;
55eed653 1438 HV *widecharmap = NULL;
2b8b4781 1439 AV *revcharmap = newAV();
a3621e74 1440 regnode *cur;
9f7f3913 1441 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1442 STRLEN len = 0;
1443 UV uvc = 0;
1444 U16 curword = 0;
1445 U32 next_alloc = 0;
786e8c11
YO
1446 regnode *jumper = NULL;
1447 regnode *nextbranch = NULL;
7f69552c 1448 regnode *convert = NULL;
2e64971a 1449 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1450 /* we just use folder as a flag in utf8 */
1e696034 1451 const U8 * folder = NULL;
a3621e74 1452
2b8b4781
NC
1453#ifdef DEBUGGING
1454 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1455 AV *trie_words = NULL;
1456 /* along with revcharmap, this only used during construction but both are
1457 * useful during debugging so we store them in the struct when debugging.
8e11feef 1458 */
2b8b4781
NC
1459#else
1460 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1461 STRLEN trie_charcount=0;
3dab1dad 1462#endif
2b8b4781 1463 SV *re_trie_maxbuff;
a3621e74 1464 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1465
1466 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1467#ifndef DEBUGGING
1468 PERL_UNUSED_ARG(depth);
1469#endif
a3621e74 1470
1e696034 1471 switch (flags) {
2f7f8cb1 1472 case EXACTFA:
1e696034
KW
1473 case EXACTFU: folder = PL_fold_latin1; break;
1474 case EXACTF: folder = PL_fold; break;
1475 case EXACTFL: folder = PL_fold_locale; break;
1476 }
1477
c944940b 1478 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1479 trie->refcount = 1;
3dab1dad 1480 trie->startstate = 1;
786e8c11 1481 trie->wordcount = word_count;
f8fc2ecf 1482 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1483 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1484 if (!(UTF && folder))
c944940b 1485 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1486 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1487 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1488
a3621e74 1489 DEBUG_r({
2b8b4781 1490 trie_words = newAV();
a3621e74 1491 });
a3621e74 1492
0111c4fd 1493 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1494 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1495 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1496 }
3dab1dad
YO
1497 DEBUG_OPTIMISE_r({
1498 PerlIO_printf( Perl_debug_log,
786e8c11 1499 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1500 (int)depth * 2 + 2, "",
1501 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1502 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1503 (int)depth);
3dab1dad 1504 });
7f69552c
YO
1505
1506 /* Find the node we are going to overwrite */
1507 if ( first == startbranch && OP( last ) != BRANCH ) {
1508 /* whole branch chain */
1509 convert = first;
1510 } else {
1511 /* branch sub-chain */
1512 convert = NEXTOPER( first );
1513 }
1514
a3621e74
YO
1515 /* -- First loop and Setup --
1516
1517 We first traverse the branches and scan each word to determine if it
1518 contains widechars, and how many unique chars there are, this is
1519 important as we have to build a table with at least as many columns as we
1520 have unique chars.
1521
1522 We use an array of integers to represent the character codes 0..255
38a44b82 1523 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1524 native representation of the character value as the key and IV's for the
1525 coded index.
1526
1527 *TODO* If we keep track of how many times each character is used we can
1528 remap the columns so that the table compression later on is more
3b753521 1529 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1530 middle and the least common are on the outside. IMO this would be better
1531 than a most to least common mapping as theres a decent chance the most
1532 common letter will share a node with the least common, meaning the node
486ec47a 1533 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1534 case is when we have the least common nodes twice.
1535
1536 */
1537
a3621e74 1538 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1539 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1540 const U8 *uc = (U8*)STRING( noper );
a28509cc 1541 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1542 STRLEN foldlen = 0;
1543 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1544 const U8 *scan = (U8*)NULL;
07be1b83 1545 U32 wordlen = 0; /* required init */
02daf0ab
YO
1546 STRLEN chars = 0;
1547 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1548
3dab1dad
YO
1549 if (OP(noper) == NOTHING) {
1550 trie->minlen= 0;
1551 continue;
1552 }
02daf0ab
YO
1553 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1554 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1555 regardless of encoding */
1556
a3621e74 1557 for ( ; uc < e ; uc += len ) {
3dab1dad 1558 TRIE_CHARCOUNT(trie)++;
a3621e74 1559 TRIE_READ_CHAR;
3dab1dad 1560 chars++;
a3621e74
YO
1561 if ( uvc < 256 ) {
1562 if ( !trie->charmap[ uvc ] ) {
1563 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1564 if ( folder )
1565 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1566 TRIE_STORE_REVCHAR;
a3621e74 1567 }
02daf0ab 1568 if ( set_bit ) {
62012aee
KW
1569 /* store the codepoint in the bitmap, and its folded
1570 * equivalent. */
02daf0ab 1571 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1572
1573 /* store the folded codepoint */
1574 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1575
1576 if ( !UTF ) {
1577 /* store first byte of utf8 representation of
acdf4139
KW
1578 variant codepoints */
1579 if (! UNI_IS_INVARIANT(uvc)) {
1580 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1581 }
1582 }
02daf0ab
YO
1583 set_bit = 0; /* We've done our bit :-) */
1584 }
a3621e74
YO
1585 } else {
1586 SV** svpp;
55eed653
NC
1587 if ( !widecharmap )
1588 widecharmap = newHV();
a3621e74 1589
55eed653 1590 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1591
1592 if ( !svpp )
e4584336 1593 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1594
1595 if ( !SvTRUE( *svpp ) ) {
1596 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1597 TRIE_STORE_REVCHAR;
a3621e74
YO
1598 }
1599 }
1600 }
3dab1dad
YO
1601 if( cur == first ) {
1602 trie->minlen=chars;
1603 trie->maxlen=chars;
1604 } else if (chars < trie->minlen) {
1605 trie->minlen=chars;
1606 } else if (chars > trie->maxlen) {
1607 trie->maxlen=chars;
1608 }
1609
a3621e74
YO
1610 } /* end first pass */
1611 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1612 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1613 (int)depth * 2 + 2,"",
55eed653 1614 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1615 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1616 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1617 );
a3621e74
YO
1618
1619 /*
1620 We now know what we are dealing with in terms of unique chars and
1621 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1622 representation using a flat table will take. If it's over a reasonable
1623 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1624 conservative but potentially much slower representation using an array
1625 of lists.
1626
1627 At the end we convert both representations into the same compressed
1628 form that will be used in regexec.c for matching with. The latter
1629 is a form that cannot be used to construct with but has memory
1630 properties similar to the list form and access properties similar
1631 to the table form making it both suitable for fast searches and
1632 small enough that its feasable to store for the duration of a program.
1633
1634 See the comment in the code where the compressed table is produced
1635 inplace from the flat tabe representation for an explanation of how
1636 the compression works.
1637
1638 */
1639
1640
2e64971a
DM
1641 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1642 prev_states[1] = 0;
1643
3dab1dad 1644 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1645 /*
1646 Second Pass -- Array Of Lists Representation
1647
1648 Each state will be represented by a list of charid:state records
1649 (reg_trie_trans_le) the first such element holds the CUR and LEN
1650 points of the allocated array. (See defines above).
1651
1652 We build the initial structure using the lists, and then convert
1653 it into the compressed table form which allows faster lookups
1654 (but cant be modified once converted).
a3621e74
YO
1655 */
1656
a3621e74
YO
1657 STRLEN transcount = 1;
1658
1e2e3d02
YO
1659 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1660 "%*sCompiling trie using list compiler\n",
1661 (int)depth * 2 + 2, ""));
446bd890 1662
c944940b
JH
1663 trie->states = (reg_trie_state *)
1664 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1665 sizeof(reg_trie_state) );
a3621e74
YO
1666 TRIE_LIST_NEW(1);
1667 next_alloc = 2;
1668
1669 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1670
c445ea15
AL
1671 regnode * const noper = NEXTOPER( cur );
1672 U8 *uc = (U8*)STRING( noper );
1673 const U8 * const e = uc + STR_LEN( noper );
1674 U32 state = 1; /* required init */
1675 U16 charid = 0; /* sanity init */
1676 U8 *scan = (U8*)NULL; /* sanity init */
1677 STRLEN foldlen = 0; /* required init */
07be1b83 1678 U32 wordlen = 0; /* required init */
c445ea15
AL
1679 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1680
3dab1dad 1681 if (OP(noper) != NOTHING) {
786e8c11 1682 for ( ; uc < e ; uc += len ) {
c445ea15 1683
786e8c11 1684 TRIE_READ_CHAR;
c445ea15 1685
786e8c11
YO
1686 if ( uvc < 256 ) {
1687 charid = trie->charmap[ uvc ];
c445ea15 1688 } else {
55eed653 1689 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1690 if ( !svpp ) {
1691 charid = 0;
1692 } else {
1693 charid=(U16)SvIV( *svpp );
1694 }
c445ea15 1695 }
786e8c11
YO
1696 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1697 if ( charid ) {
a3621e74 1698
786e8c11
YO
1699 U16 check;
1700 U32 newstate = 0;
a3621e74 1701
786e8c11
YO
1702 charid--;
1703 if ( !trie->states[ state ].trans.list ) {
1704 TRIE_LIST_NEW( state );
c445ea15 1705 }
786e8c11
YO
1706 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1707 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1708 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1709 break;
1710 }
1711 }
1712 if ( ! newstate ) {
1713 newstate = next_alloc++;
2e64971a 1714 prev_states[newstate] = state;
786e8c11
YO
1715 TRIE_LIST_PUSH( state, charid, newstate );
1716 transcount++;
1717 }
1718 state = newstate;
1719 } else {
1720 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1721 }
a28509cc 1722 }
c445ea15 1723 }
3dab1dad 1724 TRIE_HANDLE_WORD(state);
a3621e74
YO
1725
1726 } /* end second pass */
1727
1e2e3d02
YO
1728 /* next alloc is the NEXT state to be allocated */
1729 trie->statecount = next_alloc;
c944940b
JH
1730 trie->states = (reg_trie_state *)
1731 PerlMemShared_realloc( trie->states,
1732 next_alloc
1733 * sizeof(reg_trie_state) );
a3621e74 1734
3dab1dad 1735 /* and now dump it out before we compress it */
2b8b4781
NC
1736 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1737 revcharmap, next_alloc,
1738 depth+1)
1e2e3d02 1739 );
a3621e74 1740
c944940b
JH
1741 trie->trans = (reg_trie_trans *)
1742 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1743 {
1744 U32 state;
a3621e74
YO
1745 U32 tp = 0;
1746 U32 zp = 0;
1747
1748
1749 for( state=1 ; state < next_alloc ; state ++ ) {
1750 U32 base=0;
1751
1752 /*
1753 DEBUG_TRIE_COMPILE_MORE_r(
1754 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1755 );
1756 */
1757
1758 if (trie->states[state].trans.list) {
1759 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1760 U16 maxid=minid;
a28509cc 1761 U16 idx;
a3621e74
YO
1762
1763 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1764 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1765 if ( forid < minid ) {
1766 minid=forid;
1767 } else if ( forid > maxid ) {
1768 maxid=forid;
1769 }
a3621e74
YO
1770 }
1771 if ( transcount < tp + maxid - minid + 1) {
1772 transcount *= 2;
c944940b
JH
1773 trie->trans = (reg_trie_trans *)
1774 PerlMemShared_realloc( trie->trans,
446bd890
NC
1775 transcount
1776 * sizeof(reg_trie_trans) );
a3621e74
YO
1777 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1778 }
1779 base = trie->uniquecharcount + tp - minid;
1780 if ( maxid == minid ) {
1781 U32 set = 0;
1782 for ( ; zp < tp ; zp++ ) {
1783 if ( ! trie->trans[ zp ].next ) {
1784 base = trie->uniquecharcount + zp - minid;
1785 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1786 trie->trans[ zp ].check = state;
1787 set = 1;
1788 break;
1789 }
1790 }
1791 if ( !set ) {
1792 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1793 trie->trans[ tp ].check = state;
1794 tp++;
1795 zp = tp;
1796 }
1797 } else {
1798 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1799 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1800 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1801 trie->trans[ tid ].check = state;
1802 }
1803 tp += ( maxid - minid + 1 );
1804 }
1805 Safefree(trie->states[ state ].trans.list);
1806 }
1807 /*
1808 DEBUG_TRIE_COMPILE_MORE_r(
1809 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1810 );
1811 */
1812 trie->states[ state ].trans.base=base;
1813 }
cc601c31 1814 trie->lasttrans = tp + 1;
a3621e74
YO
1815 }
1816 } else {
1817 /*
1818 Second Pass -- Flat Table Representation.
1819
1820 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1821 We know that we will need Charcount+1 trans at most to store the data
1822 (one row per char at worst case) So we preallocate both structures
1823 assuming worst case.
1824
1825 We then construct the trie using only the .next slots of the entry
1826 structs.
1827
3b753521 1828 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1829 make compression both faster and easier by keeping track of how many non
1830 zero fields are in the node.
1831
1832 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1833 transition.
1834
1835 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1836 number representing the first entry of the node, and state as a
1837 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1838 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1839 are 2 entrys per node. eg:
1840
1841 A B A B
1842 1. 2 4 1. 3 7
1843 2. 0 3 3. 0 5
1844 3. 0 0 5. 0 0
1845 4. 0 0 7. 0 0
1846
1847 The table is internally in the right hand, idx form. However as we also
1848 have to deal with the states array which is indexed by nodenum we have to
1849 use TRIE_NODENUM() to convert.
1850
1851 */
1e2e3d02
YO
1852 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1853 "%*sCompiling trie using table compiler\n",
1854 (int)depth * 2 + 2, ""));
3dab1dad 1855
c944940b
JH
1856 trie->trans = (reg_trie_trans *)
1857 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1858 * trie->uniquecharcount + 1,
1859 sizeof(reg_trie_trans) );
1860 trie->states = (reg_trie_state *)
1861 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1862 sizeof(reg_trie_state) );
a3621e74
YO
1863 next_alloc = trie->uniquecharcount + 1;
1864
3dab1dad 1865
a3621e74
YO
1866 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1867
c445ea15 1868 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1869 const U8 *uc = (U8*)STRING( noper );
1870 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1871
1872 U32 state = 1; /* required init */
1873
1874 U16 charid = 0; /* sanity init */
1875 U32 accept_state = 0; /* sanity init */
1876 U8 *scan = (U8*)NULL; /* sanity init */
1877
1878 STRLEN foldlen = 0; /* required init */
07be1b83 1879 U32 wordlen = 0; /* required init */
a3621e74
YO
1880 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1881
3dab1dad 1882 if ( OP(noper) != NOTHING ) {
786e8c11 1883 for ( ; uc < e ; uc += len ) {
a3621e74 1884
786e8c11 1885 TRIE_READ_CHAR;
a3621e74 1886
786e8c11
YO
1887 if ( uvc < 256 ) {
1888 charid = trie->charmap[ uvc ];
1889 } else {
55eed653 1890 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1891 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1892 }
786e8c11
YO
1893 if ( charid ) {
1894 charid--;
1895 if ( !trie->trans[ state + charid ].next ) {
1896 trie->trans[ state + charid ].next = next_alloc;
1897 trie->trans[ state ].check++;
2e64971a
DM
1898 prev_states[TRIE_NODENUM(next_alloc)]
1899 = TRIE_NODENUM(state);
786e8c11
YO
1900 next_alloc += trie->uniquecharcount;
1901 }
1902 state = trie->trans[ state + charid ].next;
1903 } else {
1904 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1905 }
1906 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1907 }
a3621e74 1908 }
3dab1dad
YO
1909 accept_state = TRIE_NODENUM( state );
1910 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1911
1912 } /* end second pass */
1913
3dab1dad 1914 /* and now dump it out before we compress it */
2b8b4781
NC
1915 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1916 revcharmap,
1917 next_alloc, depth+1));
a3621e74 1918
a3621e74
YO
1919 {
1920 /*
1921 * Inplace compress the table.*
1922
1923 For sparse data sets the table constructed by the trie algorithm will
1924 be mostly 0/FAIL transitions or to put it another way mostly empty.
1925 (Note that leaf nodes will not contain any transitions.)
1926
1927 This algorithm compresses the tables by eliminating most such
1928 transitions, at the cost of a modest bit of extra work during lookup:
1929
1930 - Each states[] entry contains a .base field which indicates the
1931 index in the state[] array wheres its transition data is stored.
1932
3b753521 1933 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1934
1935 - If .base is nonzero then charid is added to it to find an entry in
1936 the trans array.
1937
1938 -If trans[states[state].base+charid].check!=state then the
1939 transition is taken to be a 0/Fail transition. Thus if there are fail
1940 transitions at the front of the node then the .base offset will point
1941 somewhere inside the previous nodes data (or maybe even into a node
1942 even earlier), but the .check field determines if the transition is
1943 valid.
1944
786e8c11 1945 XXX - wrong maybe?
a3621e74 1946 The following process inplace converts the table to the compressed
3b753521 1947 table: We first do not compress the root node 1,and mark all its
a3621e74 1948 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1949 allows us to do a DFA construction from the compressed table later,
1950 and ensures that any .base pointers we calculate later are greater
1951 than 0.
a3621e74
YO
1952
1953 - We set 'pos' to indicate the first entry of the second node.
1954
1955 - We then iterate over the columns of the node, finding the first and
1956 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1957 and set the .check pointers accordingly, and advance pos
1958 appropriately and repreat for the next node. Note that when we copy
1959 the next pointers we have to convert them from the original
1960 NODEIDX form to NODENUM form as the former is not valid post
1961 compression.
1962
1963 - If a node has no transitions used we mark its base as 0 and do not
1964 advance the pos pointer.
1965
1966 - If a node only has one transition we use a second pointer into the
1967 structure to fill in allocated fail transitions from other states.
1968 This pointer is independent of the main pointer and scans forward
1969 looking for null transitions that are allocated to a state. When it
1970 finds one it writes the single transition into the "hole". If the
786e8c11 1971 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1972
1973 - Once compressed we can Renew/realloc the structures to release the
1974 excess space.
1975
1976 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1977 specifically Fig 3.47 and the associated pseudocode.
1978
1979 demq
1980 */
a3b680e6 1981 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1982 U32 state, charid;
a3621e74 1983 U32 pos = 0, zp=0;
1e2e3d02 1984 trie->statecount = laststate;
a3621e74
YO
1985
1986 for ( state = 1 ; state < laststate ; state++ ) {
1987 U8 flag = 0;
a28509cc
AL
1988 const U32 stateidx = TRIE_NODEIDX( state );
1989 const U32 o_used = trie->trans[ stateidx ].check;
1990 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1991 trie->trans[ stateidx ].check = 0;
1992
1993 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1994 if ( flag || trie->trans[ stateidx + charid ].next ) {
1995 if ( trie->trans[ stateidx + charid ].next ) {
1996 if (o_used == 1) {
1997 for ( ; zp < pos ; zp++ ) {
1998 if ( ! trie->trans[ zp ].next ) {
1999 break;
2000 }
2001 }
2002 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2003 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2004 trie->trans[ zp ].check = state;
2005 if ( ++zp > pos ) pos = zp;
2006 break;
2007 }
2008 used--;
2009 }
2010 if ( !flag ) {
2011 flag = 1;
2012 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2013 }
2014 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2015 trie->trans[ pos ].check = state;
2016 pos++;
2017 }
2018 }
2019 }
cc601c31 2020 trie->lasttrans = pos + 1;
c944940b
JH
2021 trie->states = (reg_trie_state *)
2022 PerlMemShared_realloc( trie->states, laststate
2023 * sizeof(reg_trie_state) );
a3621e74 2024 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2025 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2026 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2027 (int)depth * 2 + 2,"",
2028 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2029 (IV)next_alloc,
2030 (IV)pos,
a3621e74
YO
2031 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2032 );
2033
2034 } /* end table compress */
2035 }
1e2e3d02
YO
2036 DEBUG_TRIE_COMPILE_MORE_r(
2037 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2038 (int)depth * 2 + 2, "",
2039 (UV)trie->statecount,
2040 (UV)trie->lasttrans)
2041 );
cc601c31 2042 /* resize the trans array to remove unused space */
c944940b
JH
2043 trie->trans = (reg_trie_trans *)
2044 PerlMemShared_realloc( trie->trans, trie->lasttrans
2045 * sizeof(reg_trie_trans) );
a3621e74 2046
3b753521 2047 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2048 U8 nodetype =(U8)(flags & 0xFF);
2049 char *str=NULL;
786e8c11 2050
07be1b83 2051#ifdef DEBUGGING
e62cc96a 2052 regnode *optimize = NULL;
7122b237
YO
2053#ifdef RE_TRACK_PATTERN_OFFSETS
2054
b57a0404
JH
2055 U32 mjd_offset = 0;
2056 U32 mjd_nodelen = 0;
7122b237
YO
2057#endif /* RE_TRACK_PATTERN_OFFSETS */
2058#endif /* DEBUGGING */
a3621e74 2059 /*
3dab1dad
YO
2060 This means we convert either the first branch or the first Exact,
2061 depending on whether the thing following (in 'last') is a branch
2062 or not and whther first is the startbranch (ie is it a sub part of
2063 the alternation or is it the whole thing.)
3b753521 2064 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2065 the whole branch sequence, including the first.
a3621e74 2066 */
3dab1dad 2067 /* Find the node we are going to overwrite */
7f69552c 2068 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2069 /* branch sub-chain */
3dab1dad 2070 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2071#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2072 DEBUG_r({
2073 mjd_offset= Node_Offset((convert));
2074 mjd_nodelen= Node_Length((convert));
2075 });
7122b237 2076#endif
7f69552c 2077 /* whole branch chain */
7122b237
YO
2078 }
2079#ifdef RE_TRACK_PATTERN_OFFSETS
2080 else {
7f69552c
YO
2081 DEBUG_r({
2082 const regnode *nop = NEXTOPER( convert );
2083 mjd_offset= Node_Offset((nop));
2084 mjd_nodelen= Node_Length((nop));
2085 });
07be1b83
YO
2086 }
2087 DEBUG_OPTIMISE_r(
2088 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2089 (int)depth * 2 + 2, "",
786e8c11 2090 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2091 );
7122b237 2092#endif
3dab1dad
YO
2093 /* But first we check to see if there is a common prefix we can
2094 split out as an EXACT and put in front of the TRIE node. */
2095 trie->startstate= 1;
55eed653 2096 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2097 U32 state;
1e2e3d02 2098 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2099 U32 ofs = 0;
8e11feef
RGS
2100 I32 idx = -1;
2101 U32 count = 0;
2102 const U32 base = trie->states[ state ].trans.base;
a3621e74 2103
3dab1dad 2104 if ( trie->states[state].wordnum )
8e11feef 2105 count = 1;
a3621e74 2106
8e11feef 2107 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2108 if ( ( base + ofs >= trie->uniquecharcount ) &&
2109 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2110 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2111 {
3dab1dad 2112 if ( ++count > 1 ) {
2b8b4781 2113 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2114 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2115 if ( state == 1 ) break;
3dab1dad
YO
2116 if ( count == 2 ) {
2117 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2118 DEBUG_OPTIMISE_r(
8e11feef
RGS
2119 PerlIO_printf(Perl_debug_log,
2120 "%*sNew Start State=%"UVuf" Class: [",
2121 (int)depth * 2 + 2, "",
786e8c11 2122 (UV)state));
be8e71aa 2123 if (idx >= 0) {
2b8b4781 2124 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2125 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2126
3dab1dad 2127 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2128 if ( folder )
2129 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2130 DEBUG_OPTIMISE_r(
f1f66076 2131 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2132 );
8e11feef
RGS
2133 }
2134 }
2135 TRIE_BITMAP_SET(trie,*ch);
2136 if ( folder )
2137 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2138 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2139 }
2140 idx = ofs;
2141 }
3dab1dad
YO
2142 }
2143 if ( count == 1 ) {
2b8b4781 2144 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2145 STRLEN len;
2146 char *ch = SvPV( *tmp, len );
de734bd5
A
2147 DEBUG_OPTIMISE_r({
2148 SV *sv=sv_newmortal();
8e11feef
RGS
2149 PerlIO_printf( Perl_debug_log,
2150 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2151 (int)depth * 2 + 2, "",
de734bd5
A
2152 (UV)state, (UV)idx,
2153 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2154 PL_colors[0], PL_colors[1],
2155 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2156 PERL_PV_ESCAPE_FIRSTCHAR
2157 )
2158 );
2159 });
3dab1dad
YO
2160 if ( state==1 ) {
2161 OP( convert ) = nodetype;
2162 str=STRING(convert);
2163 STR_LEN(convert)=0;
2164 }
c490c714
YO
2165 STR_LEN(convert) += len;
2166 while (len--)
de734bd5 2167 *str++ = *ch++;
8e11feef 2168 } else {
f9049ba1 2169#ifdef DEBUGGING
8e11feef
RGS
2170 if (state>1)
2171 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2172#endif
8e11feef
RGS
2173 break;
2174 }
2175 }
2e64971a 2176 trie->prefixlen = (state-1);
3dab1dad 2177 if (str) {
8e11feef 2178 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2179 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2180 trie->startstate = state;
07be1b83
YO
2181 trie->minlen -= (state - 1);
2182 trie->maxlen -= (state - 1);
33809eae
JH
2183#ifdef DEBUGGING
2184 /* At least the UNICOS C compiler choked on this
2185 * being argument to DEBUG_r(), so let's just have
2186 * it right here. */
2187 if (
2188#ifdef PERL_EXT_RE_BUILD
2189 1
2190#else
2191 DEBUG_r_TEST
2192#endif
2193 ) {
2194 regnode *fix = convert;
2195 U32 word = trie->wordcount;
2196 mjd_nodelen++;
2197 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2198 while( ++fix < n ) {
2199 Set_Node_Offset_Length(fix, 0, 0);
2200 }
2201 while (word--) {
2202 SV ** const tmp = av_fetch( trie_words, word, 0 );
2203 if (tmp) {
2204 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2205 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2206 else
2207 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2208 }
2209 }
2210 }
2211#endif
8e11feef
RGS
2212 if (trie->maxlen) {
2213 convert = n;
2214 } else {
3dab1dad 2215 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2216 DEBUG_r(optimize= n);
3dab1dad
YO
2217 }
2218 }
2219 }
a5ca303d
YO
2220 if (!jumper)
2221 jumper = last;
3dab1dad 2222 if ( trie->maxlen ) {
8e11feef
RGS
2223 NEXT_OFF( convert ) = (U16)(tail - convert);
2224 ARG_SET( convert, data_slot );
786e8c11
YO
2225 /* Store the offset to the first unabsorbed branch in
2226 jump[0], which is otherwise unused by the jump logic.
2227 We use this when dumping a trie and during optimisation. */
2228 if (trie->jump)
7f69552c 2229 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2230
6c48061a
YO
2231 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2232 * and there is a bitmap
2233 * and the first "jump target" node we found leaves enough room
2234 * then convert the TRIE node into a TRIEC node, with the bitmap
2235 * embedded inline in the opcode - this is hypothetically faster.
2236 */
2237 if ( !trie->states[trie->startstate].wordnum
2238 && trie->bitmap
2239 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2240 {
2241 OP( convert ) = TRIEC;
2242 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2243 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2244 trie->bitmap= NULL;
2245 } else
2246 OP( convert ) = TRIE;
a3621e74 2247
3dab1dad
YO
2248 /* store the type in the flags */
2249 convert->flags = nodetype;
a5ca303d
YO
2250 DEBUG_r({
2251 optimize = convert
2252 + NODE_STEP_REGNODE
2253 + regarglen[ OP( convert ) ];
2254 });
2255 /* XXX We really should free up the resource in trie now,
2256 as we won't use them - (which resources?) dmq */
3dab1dad 2257 }
a3621e74 2258 /* needed for dumping*/
e62cc96a 2259 DEBUG_r(if (optimize) {
07be1b83 2260 regnode *opt = convert;
bcdf7404 2261
e62cc96a 2262 while ( ++opt < optimize) {
07be1b83
YO
2263 Set_Node_Offset_Length(opt,0,0);
2264 }
786e8c11
YO
2265 /*
2266 Try to clean up some of the debris left after the
2267 optimisation.
a3621e74 2268 */
786e8c11 2269 while( optimize < jumper ) {
07be1b83 2270 mjd_nodelen += Node_Length((optimize));
a3621e74 2271 OP( optimize ) = OPTIMIZED;
07be1b83 2272 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2273 optimize++;
2274 }
07be1b83 2275 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2276 });
2277 } /* end node insert */
2e64971a
DM
2278
2279 /* Finish populating the prev field of the wordinfo array. Walk back
2280 * from each accept state until we find another accept state, and if
2281 * so, point the first word's .prev field at the second word. If the
2282 * second already has a .prev field set, stop now. This will be the
2283 * case either if we've already processed that word's accept state,
3b753521
FN
2284 * or that state had multiple words, and the overspill words were
2285 * already linked up earlier.
2e64971a
DM
2286 */
2287 {
2288 U16 word;
2289 U32 state;
2290 U16 prev;
2291
2292 for (word=1; word <= trie->wordcount; word++) {
2293 prev = 0;
2294 if (trie->wordinfo[word].prev)
2295 continue;
2296 state = trie->wordinfo[word].accept;
2297 while (state) {
2298 state = prev_states[state];
2299 if (!state)
2300 break;
2301 prev = trie->states[state].wordnum;
2302 if (prev)
2303 break;
2304 }
2305 trie->wordinfo[word].prev = prev;
2306 }
2307 Safefree(prev_states);
2308 }
2309
2310
2311 /* and now dump out the compressed format */
2312 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2313
55eed653 2314 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2315#ifdef DEBUGGING
2316 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2317 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2318#else
2319 SvREFCNT_dec(revcharmap);
07be1b83 2320#endif
786e8c11
YO
2321 return trie->jump
2322 ? MADE_JUMP_TRIE
2323 : trie->startstate>1
2324 ? MADE_EXACT_TRIE
2325 : MADE_TRIE;
2326}
2327
2328STATIC void
2329S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2330{
3b753521 2331/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2332
2333 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2334 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2335 ISBN 0-201-10088-6
2336
2337 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2338 suffix of the current state's 'word' that is also a proper prefix of another word in our
2339 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2340 the DFA not to have to restart after its tried and failed a word at a given point, it
2341 simply continues as though it had been matching the other word in the first place.
2342 Consider
2343 'abcdgu'=~/abcdefg|cdgu/
2344 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2345 fail, which would bring us to the state representing 'd' in the second word where we would
2346 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2347 */
2348 /* add a fail transition */
3251b653
NC
2349 const U32 trie_offset = ARG(source);
2350 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2351 U32 *q;
2352 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2353 const U32 numstates = trie->statecount;
786e8c11
YO
2354 const U32 ubound = trie->lasttrans + ucharcount;
2355 U32 q_read = 0;
2356 U32 q_write = 0;
2357 U32 charid;
2358 U32 base = trie->states[ 1 ].trans.base;
2359 U32 *fail;
2360 reg_ac_data *aho;
2361 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2362 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2363
2364 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2365#ifndef DEBUGGING
2366 PERL_UNUSED_ARG(depth);
2367#endif
2368
2369
2370 ARG_SET( stclass, data_slot );
c944940b 2371 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2372 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2373 aho->trie=trie_offset;
446bd890
NC
2374 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2375 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2376 Newxz( q, numstates, U32);
c944940b 2377 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2378 aho->refcount = 1;
2379 fail = aho->fail;
2380 /* initialize fail[0..1] to be 1 so that we always have
2381 a valid final fail state */
2382 fail[ 0 ] = fail[ 1 ] = 1;
2383
2384 for ( charid = 0; charid < ucharcount ; charid++ ) {
2385 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2386 if ( newstate ) {
2387 q[ q_write ] = newstate;
2388 /* set to point at the root */
2389 fail[ q[ q_write++ ] ]=1;
2390 }
2391 }
2392 while ( q_read < q_write) {
2393 const U32 cur = q[ q_read++ % numstates ];
2394 base = trie->states[ cur ].trans.base;
2395
2396 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2397 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2398 if (ch_state) {
2399 U32 fail_state = cur;
2400 U32 fail_base;
2401 do {
2402 fail_state = fail[ fail_state ];
2403 fail_base = aho->states[ fail_state ].trans.base;
2404 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2405
2406 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2407 fail[ ch_state ] = fail_state;
2408 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2409 {
2410 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2411 }
2412 q[ q_write++ % numstates] = ch_state;
2413 }
2414 }
2415 }
2416 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2417 when we fail in state 1, this allows us to use the
2418 charclass scan to find a valid start char. This is based on the principle
2419 that theres a good chance the string being searched contains lots of stuff
2420 that cant be a start char.
2421 */
2422 fail[ 0 ] = fail[ 1 ] = 0;
2423 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2424 PerlIO_printf(Perl_debug_log,
2425 "%*sStclass Failtable (%"UVuf" states): 0",
2426 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2427 );
786e8c11
YO
2428 for( q_read=1; q_read<numstates; q_read++ ) {
2429 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2430 }
2431 PerlIO_printf(Perl_debug_log, "\n");
2432 });
2433 Safefree(q);
2434 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2435}
2436
786e8c11 2437
a3621e74 2438/*
5d1c421c
JH
2439 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2440 * These need to be revisited when a newer toolchain becomes available.
2441 */
2442#if defined(__sparc64__) && defined(__GNUC__)
2443# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2444# undef SPARC64_GCC_WORKAROUND
2445# define SPARC64_GCC_WORKAROUND 1
2446# endif
2447#endif
2448
07be1b83 2449#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2450 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2451 SV * const mysv=sv_newmortal(); \
2452 regnode *Next = regnext(scan); \
2453 regprop(RExC_rx, mysv, scan); \
7f69552c 2454 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2455 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2456 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2457 }});
07be1b83 2458
1de06328
YO
2459
2460
2461
2462
07be1b83
YO
2463#define JOIN_EXACT(scan,min,flags) \
2464 if (PL_regkind[OP(scan)] == EXACT) \
2465 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2466
be8e71aa 2467STATIC U32
07be1b83
YO
2468S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2469 /* Merge several consecutive EXACTish nodes into one. */
2470 regnode *n = regnext(scan);
2471 U32 stringok = 1;
2472 regnode *next = scan + NODE_SZ_STR(scan);
2473 U32 merged = 0;
2474 U32 stopnow = 0;
2475#ifdef DEBUGGING
2476 regnode *stop = scan;
72f13be8 2477 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2478#else
d47053eb
RGS
2479 PERL_UNUSED_ARG(depth);
2480#endif
7918f24d
NC
2481
2482 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2483#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2484 PERL_UNUSED_ARG(flags);
2485 PERL_UNUSED_ARG(val);
07be1b83 2486#endif
07be1b83
YO
2487 DEBUG_PEEP("join",scan,depth);
2488
2489 /* Skip NOTHING, merge EXACT*. */
2490 while (n &&
2491 ( PL_regkind[OP(n)] == NOTHING ||
2492 (stringok && (OP(n) == OP(scan))))
2493 && NEXT_OFF(n)
2494 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2495
2496 if (OP(n) == TAIL || n > next)
2497 stringok = 0;
2498 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2499 DEBUG_PEEP("skip:",n,depth);
2500 NEXT_OFF(scan) += NEXT_OFF(n);
2501 next = n + NODE_STEP_REGNODE;
2502#ifdef DEBUGGING
2503 if (stringok)
2504 stop = n;
2505#endif
2506 n = regnext(n);
2507 }
2508 else if (stringok) {
786e8c11 2509 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2510 regnode * const nnext = regnext(n);
2511
2512 DEBUG_PEEP("merg",n,depth);
2513
2514 merged++;
2515 if (oldl + STR_LEN(n) > U8_MAX)
2516 break;
2517 NEXT_OFF(scan) += NEXT_OFF(n);
2518 STR_LEN(scan) += STR_LEN(n);
2519 next = n + NODE_SZ_STR(n);
2520 /* Now we can overwrite *n : */
2521 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2522#ifdef DEBUGGING
2523 stop = next - 1;
2524#endif
2525 n = nnext;
2526 if (stopnow) break;
2527 }
2528
d47053eb
RGS
2529#ifdef EXPERIMENTAL_INPLACESCAN
2530 if (flags && !NEXT_OFF(n)) {
2531 DEBUG_PEEP("atch", val, depth);
2532 if (reg_off_by_arg[OP(n)]) {
2533 ARG_SET(n, val - n);
2534 }
2535 else {
2536 NEXT_OFF(n) = val - n;
2537 }
2538 stopnow = 1;
2539 }
07be1b83
YO
2540#endif
2541 }
ced7f090
KW
2542#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2543#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2544#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2545#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2546
2547 if (UTF
2f7f8cb1 2548 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2c2b7f86
KW
2549 && ( STR_LEN(scan) >= 6 ) )
2550 {
07be1b83
YO
2551 /*
2552 Two problematic code points in Unicode casefolding of EXACT nodes:
2553
2554 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2555 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2556
2557 which casefold to
2558
2559 Unicode UTF-8
2560
2561 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2562 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2563
2564 This means that in case-insensitive matching (or "loose matching",
2565 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2566 length of the above casefolded versions) can match a target string
2567 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2568 This would rather mess up the minimum length computation.
2569
2570 What we'll do is to look for the tail four bytes, and then peek
2571 at the preceding two bytes to see whether we need to decrease
2572 the minimum length by four (six minus two).
2573
2574 Thanks to the design of UTF-8, there cannot be false matches:
2575 A sequence of valid UTF-8 bytes cannot be a subsequence of
2576 another valid sequence of UTF-8 bytes.
2577
2578 */
2579 char * const s0 = STRING(scan), *s, *t;
2580 char * const s1 = s0 + STR_LEN(scan) - 1;
2581 char * const s2 = s1 - 4;
e294cc5d
JH
2582#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2583 const char t0[] = "\xaf\x49\xaf\x42";
2584#else
07be1b83 2585 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2586#endif
07be1b83
YO
2587 const char * const t1 = t0 + 3;
2588
2589 for (s = s0 + 2;
2590 s < s2 && (t = ninstr(s, s1, t0, t1));
2591 s = t + 4) {
e294cc5d
JH
2592#ifdef EBCDIC
2593 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2594 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2595#else
07be1b83
YO
2596 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2597 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2598#endif
07be1b83
YO
2599 *min -= 4;
2600 }
2601 }
2602
2603#ifdef DEBUGGING
2604 /* Allow dumping */
2605 n = scan + NODE_SZ_STR(scan);
2606 while (n <= stop) {
2607 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2608 OP(n) = OPTIMIZED;
2609 NEXT_OFF(n) = 0;
2610 }
2611 n++;
2612 }
2613#endif
2614 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2615 return stopnow;
2616}
2617
486ec47a 2618/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2619 Finds fixed substrings. */
2620
a0288114 2621/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2622 to the position after last scanned or to NULL. */
2623
40d049e4
YO
2624#define INIT_AND_WITHP \
2625 assert(!and_withp); \
2626 Newx(and_withp,1,struct regnode_charclass_class); \
2627 SAVEFREEPV(and_withp)
07be1b83 2628
b515a41d 2629/* this is a chain of data about sub patterns we are processing that
486ec47a 2630 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2631 we can simulate recursion without losing state. */
2632struct scan_frame;
2633typedef struct scan_frame {
2634 regnode *last; /* last node to process in this frame */
2635 regnode *next; /* next node to process when last is reached */
2636 struct scan_frame *prev; /*previous frame*/
2637 I32 stop; /* what stopparen do we use */
2638} scan_frame;
2639
304ee84b
YO
2640
2641#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2642
e1d1eefb
YO
2643#define CASE_SYNST_FNC(nAmE) \
2644case nAmE: \
2645 if (flags & SCF_DO_STCLASS_AND) { \
2646 for (value = 0; value < 256; value++) \
2647 if (!is_ ## nAmE ## _cp(value)) \
2648 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2649 } \
2650 else { \
2651 for (value = 0; value < 256; value++) \
2652 if (is_ ## nAmE ## _cp(value)) \
2653 ANYOF_BITMAP_SET(data->start_class, value); \
2654 } \
2655 break; \
2656case N ## nAmE: \
2657 if (flags & SCF_DO_STCLASS_AND) { \
2658 for (value = 0; value < 256; value++) \
2659 if (is_ ## nAmE ## _cp(value)) \
2660 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2661 } \
2662 else { \
2663 for (value = 0; value < 256; value++) \
2664 if (!is_ ## nAmE ## _cp(value)) \
2665 ANYOF_BITMAP_SET(data->start_class, value); \
2666 } \
2667 break
2668
2669
2670
76e3520e 2671STATIC I32
40d049e4 2672S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2673 I32 *minlenp, I32 *deltap,
40d049e4
YO
2674 regnode *last,
2675 scan_data_t *data,
2676 I32 stopparen,
2677 U8* recursed,
2678 struct regnode_charclass_class *and_withp,
2679 U32 flags, U32 depth)
c277df42
IZ
2680 /* scanp: Start here (read-write). */
2681 /* deltap: Write maxlen-minlen here. */
2682 /* last: Stop before this one. */
40d049e4
YO
2683 /* data: string data about the pattern */
2684 /* stopparen: treat close N as END */
2685 /* recursed: which subroutines have we recursed into */
2686 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2687{
97aff369 2688 dVAR;
c277df42
IZ
2689 I32 min = 0, pars = 0, code;
2690 regnode *scan = *scanp, *next;
2691 I32 delta = 0;
2692 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2693 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2694 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2695 scan_data_t data_fake;
a3621e74 2696 SV *re_trie_maxbuff = NULL;
786e8c11 2697 regnode *first_non_open = scan;
e2e6a0f1 2698 I32 stopmin = I32_MAX;
8aa23a47 2699 scan_frame *frame = NULL;
a3621e74 2700 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2701
7918f24d
NC
2702 PERL_ARGS_ASSERT_STUDY_CHUNK;
2703
13a24bad 2704#ifdef DEBUGGING
40d049e4 2705 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2706#endif
40d049e4 2707
786e8c11 2708 if ( depth == 0 ) {
40d049e4 2709 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2710 first_non_open=regnext(first_non_open);
2711 }
2712
b81d288d 2713
8aa23a47
YO
2714 fake_study_recurse:
2715 while ( scan && OP(scan) != END && scan < last ){
2716 /* Peephole optimizer: */
304ee84b 2717 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2718 DEBUG_PEEP("Peep",scan,depth);
2719 JOIN_EXACT(scan,&min,0);
2720
2721 /* Follow the next-chain of the current node and optimize
2722 away all the NOTHINGs from it. */
2723 if (OP(scan) != CURLYX) {
2724 const int max = (reg_off_by_arg[OP(scan)]
2725 ? I32_MAX
2726 /* I32 may be smaller than U16 on CRAYs! */
2727 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2728 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2729 int noff;
2730 regnode *n = scan;
2731
2732 /* Skip NOTHING and LONGJMP. */
2733 while ((n = regnext(n))
2734 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2735 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2736 && off + noff < max)
2737 off += noff;
2738 if (reg_off_by_arg[OP(scan)])
2739 ARG(scan) = off;
2740 else
2741 NEXT_OFF(scan) = off;
2742 }
a3621e74 2743
c277df42 2744
8aa23a47
YO
2745
2746 /* The principal pseudo-switch. Cannot be a switch, since we
2747 look into several different things. */
2748 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2749 || OP(scan) == IFTHEN) {
2750 next = regnext(scan);
2751 code = OP(scan);
2752 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2753
2754 if (OP(next) == code || code == IFTHEN) {
2755 /* NOTE - There is similar code to this block below for handling
2756 TRIE nodes on a re-study. If you change stuff here check there
2757 too. */
2758 I32 max1 = 0, min1 = I32_MAX, num = 0;
2759 struct regnode_charclass_class accum;
2760 regnode * const startbranch=scan;
2761
2762 if (flags & SCF_DO_SUBSTR)
304ee84b 2763 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 2764 if (flags & SCF_DO_STCLASS)
e755fd73 2765 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
2766
2767 while (OP(scan) == code) {
2768 I32 deltanext, minnext, f = 0, fake;
2769 struct regnode_charclass_class this_class;
2770
2771 num++;
2772 data_fake.flags = 0;
2773 if (data) {
2774 data_fake.whilem_c = data->whilem_c;
2775 data_fake.last_closep = data->last_closep;
2776 }
2777 else
2778 data_fake.last_closep = &fake;
58e23c8d
YO
2779
2780 data_fake.pos_delta = delta;
8aa23a47
YO
2781 next = regnext(scan);
2782 scan = NEXTOPER(scan);
2783 if (code != BRANCH)
c277df42 2784 scan = NEXTOPER(scan);
8aa23a47 2785 if (flags & SCF_DO_STCLASS) {
e755fd73 2786 cl_init(pRExC_state, &this_class);
8aa23a47
YO
2787 data_fake.start_class = &this_class;
2788 f = SCF_DO_STCLASS_AND;
58e23c8d 2789 }
8aa23a47
YO
2790 if (flags & SCF_WHILEM_VISITED_POS)
2791 f |= SCF_WHILEM_VISITED_POS;
2792
2793 /* we suppose the run is continuous, last=next...*/
2794 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2795 next, &data_fake,
2796 stopparen, recursed, NULL, f,depth+1);
2797 if (min1 > minnext)
2798 min1 = minnext;
2799 if (max1 < minnext + deltanext)
2800 max1 = minnext + deltanext;
2801 if (deltanext == I32_MAX)
2802 is_inf = is_inf_internal = 1;
2803 scan = next;
2804 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2805 pars++;
2806 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2807 if ( stopmin > minnext)
2808 stopmin = min + min1;
2809 flags &= ~SCF_DO_SUBSTR;
2810 if (data)
2811 data->flags |= SCF_SEEN_ACCEPT;
2812 }
2813 if (data) {
2814 if (data_fake.flags & SF_HAS_EVAL)
2815 data->flags |= SF_HAS_EVAL;
2816 data->whilem_c = data_fake.whilem_c;
3dab1dad 2817 }
8aa23a47 2818 if (flags & SCF_DO_STCLASS)
3fffb88a 2819 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
2820 }
2821 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2822 min1 = 0;
2823 if (flags & SCF_DO_SUBSTR) {
2824 data->pos_min += min1;
2825 data->pos_delta += max1 - min1;
2826 if (max1 != min1 || is_inf)
2827 data->longest = &(data->longest_float);
2828 }
2829 min += min1;
2830 delta += max1 - min1;
2831 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 2832 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
2833 if (min1) {
2834 cl_and(data->start_class, and_withp);
2835 flags &= ~SCF_DO_STCLASS;
653099ff 2836 }
8aa23a47
YO
2837 }
2838 else if (flags & SCF_DO_STCLASS_AND) {
2839 if (min1) {
2840 cl_and(data->start_class, &accum);
2841 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2842 }
8aa23a47
YO
2843 else {
2844 /* Switch to OR mode: cache the old value of
2845 * data->start_class */
2846 INIT_AND_WITHP;
2847 StructCopy(data->start_class, and_withp,
2848 struct regnode_charclass_class);
2849 flags &= ~SCF_DO_STCLASS_AND;
2850 StructCopy(&accum, data->start_class,
2851 struct regnode_charclass_class);
2852 flags |= SCF_DO_STCLASS_OR;
2853 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2854 }
8aa23a47 2855 }
a3621e74 2856
8aa23a47
YO
2857 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2858 /* demq.
a3621e74 2859
8aa23a47
YO
2860 Assuming this was/is a branch we are dealing with: 'scan' now
2861 points at the item that follows the branch sequence, whatever
2862 it is. We now start at the beginning of the sequence and look
2863 for subsequences of
a3621e74 2864
8aa23a47
YO
2865 BRANCH->EXACT=>x1
2866 BRANCH->EXACT=>x2
2867 tail
a3621e74 2868
8aa23a47 2869 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2870
486ec47a 2871 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2872 element into a trie and then add the subsequent branch exact
2873 strings to the trie.
a3621e74 2874
8aa23a47 2875 We have two cases
a3621e74 2876
3b753521 2877 1. patterns where the whole set of branches can be converted.
a3621e74 2878
8aa23a47 2879 2. patterns where only a subset can be converted.
a3621e74 2880
8aa23a47
YO
2881 In case 1 we can replace the whole set with a single regop
2882 for the trie. In case 2 we need to keep the start and end
3b753521 2883 branches so
a3621e74 2884
8aa23a47
YO
2885 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2886 becomes BRANCH TRIE; BRANCH X;
786e8c11 2887
8aa23a47
YO
2888 There is an additional case, that being where there is a
2889 common prefix, which gets split out into an EXACT like node
2890 preceding the TRIE node.
a3621e74 2891
8aa23a47
YO
2892 If x(1..n)==tail then we can do a simple trie, if not we make
2893 a "jump" trie, such that when we match the appropriate word
486ec47a 2894 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2895 a nested if into a case structure of sorts.
b515a41d 2896
8aa23a47
YO
2897 */
2898
2899 int made=0;
2900 if (!re_trie_maxbuff) {
2901 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2902 if (!SvIOK(re_trie_maxbuff))
2903 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2904 }
2905 if ( SvIV(re_trie_maxbuff)>=0 ) {
2906 regnode *cur;
2907 regnode *first = (regnode *)NULL;
2908 regnode *last = (regnode *)NULL;
2909 regnode *tail = scan;
2910 U8 optype = 0;
2911 U32 count=0;
a3621e74
YO
2912
2913#ifdef DEBUGGING
8aa23a47 2914 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2915#endif
8aa23a47
YO
2916 /* var tail is used because there may be a TAIL
2917 regop in the way. Ie, the exacts will point to the
2918 thing following the TAIL, but the last branch will
2919 point at the TAIL. So we advance tail. If we
2920 have nested (?:) we may have to move through several
2921 tails.
2922 */
2923
2924 while ( OP( tail ) == TAIL ) {
2925 /* this is the TAIL generated by (?:) */
2926 tail = regnext( tail );
2927 }
a3621e74 2928
8aa23a47
YO
2929
2930 DEBUG_OPTIMISE_r({
2931 regprop(RExC_rx, mysv, tail );
2932 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2933 (int)depth * 2 + 2, "",
2934 "Looking for TRIE'able sequences. Tail node is: ",
2935 SvPV_nolen_const( mysv )
2936 );
2937 });
2938
2939 /*
2940
2941 step through the branches, cur represents each
2942 branch, noper is the first thing to be matched
2943 as part of that branch and noper_next is the
2944 regnext() of that node. if noper is an EXACT
2945 and noper_next is the same as scan (our current
2946 position in the regex) then the EXACT branch is
2947 a possible optimization target. Once we have
486ec47a 2948 two or more consecutive such branches we can
8aa23a47
YO
2949 create a trie of the EXACT's contents and stich
2950 it in place. If the sequence represents all of
2951 the branches we eliminate the whole thing and
2952 replace it with a single TRIE. If it is a
2953 subsequence then we need to stitch it in. This
2954 means the first branch has to remain, and needs
2955 to be repointed at the item on the branch chain
2956 following the last branch optimized. This could
2957 be either a BRANCH, in which case the
2958 subsequence is internal, or it could be the
2959 item following the branch sequence in which
2960 case the subsequence is at the end.
2961
2962 */
2963
2964 /* dont use tail as the end marker for this traverse */
2965 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2966 regnode * const noper = NEXTOPER( cur );
b515a41d 2967#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2968 regnode * const noper_next = regnext( noper );
b515a41d
YO
2969#endif
2970
8aa23a47
YO
2971 DEBUG_OPTIMISE_r({
2972 regprop(RExC_rx, mysv, cur);
2973 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2974 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2975
2976 regprop(RExC_rx, mysv, noper);
2977 PerlIO_printf( Perl_debug_log, " -> %s",
2978 SvPV_nolen_const(mysv));
2979
2980 if ( noper_next ) {
2981 regprop(RExC_rx, mysv, noper_next );
2982 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2983 SvPV_nolen_const(mysv));
2984 }
2985 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2986 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2987 });
2988 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2989 : PL_regkind[ OP( noper ) ] == EXACT )
2990 || OP(noper) == NOTHING )
786e8c11 2991#ifdef NOJUMPTRIE
8aa23a47 2992 && noper_next == tail
786e8c11 2993#endif
8aa23a47
YO
2994 && count < U16_MAX)
2995 {
2996 count++;
2997 if ( !first || optype == NOTHING ) {
2998 if (!first) first = cur;
2999 optype = OP( noper );
3000 } else {
3001 last = cur;
3002 }
3003 } else {
a0a388a1 3004/*
0abd0d78
YO
3005 Currently we do not believe that the trie logic can
3006 handle case insensitive matching properly when the
3007 pattern is not unicode (thus forcing unicode semantics).
3008
3009 If/when this is fixed the following define can be swapped
3010 in below to fully enable trie logic.
3011
f0c16e54
KW
3012 XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3013 not /aa
3014
a0a388a1 3015#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
3016
3017*/
f0c16e54 3018#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
0abd0d78 3019
a0a388a1 3020 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
3021 make_trie( pRExC_state,
3022 startbranch, first, cur, tail, count,
3023 optype, depth+1 );
3024 }
3025 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 3026#ifdef NOJUMPTRIE
8aa23a47 3027 && noper_next == tail
786e8c11 3028#endif
8aa23a47
YO
3029 ){
3030 count = 1;
3031 first = cur;
3032 optype = OP( noper );
3033 } else {
3034 count = 0;
3035 first = NULL;
3036 optype = 0;
3037 }
3038 last = NULL;
3039 }
3040 }
3041 DEBUG_OPTIMISE_r({
3042 regprop(RExC_rx, mysv, cur);
3043 PerlIO_printf( Perl_debug_log,
3044 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3045 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3046
3047 });
a0a388a1
YO
3048
3049 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 3050 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 3051#ifdef TRIE_STUDY_OPT
8aa23a47
YO
3052 if ( ((made == MADE_EXACT_TRIE &&
3053 startbranch == first)
3054 || ( first_non_open == first )) &&
3055 depth==0 ) {
3056 flags |= SCF_TRIE_RESTUDY;
3057 if ( startbranch == first
3058 && scan == tail )
3059 {
3060 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3061 }
3062 }
3dab1dad 3063#endif
8aa23a47
YO
3064 }
3065 }
3066
3067 } /* do trie */
3068
653099ff 3069 }
8aa23a47
YO
3070 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3071 scan = NEXTOPER(NEXTOPER(scan));
3072 } else /* single branch is optimized. */
3073 scan = NEXTOPER(scan);
3074 continue;
3075 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3076 scan_frame *newframe = NULL;
3077 I32 paren;
3078 regnode *start;
3079 regnode *end;
3080
3081 if (OP(scan) != SUSPEND) {
3082 /* set the pointer */
3083 if (OP(scan) == GOSUB) {
3084 paren = ARG(scan);
3085 RExC_recurse[ARG2L(scan)] = scan;
3086 start = RExC_open_parens[paren-1];
3087 end = RExC_close_parens[paren-1];
3088 } else {
3089 paren = 0;
f8fc2ecf 3090 start = RExC_rxi->program + 1;
8aa23a47
YO
3091 end = RExC_opend;
3092 }
3093 if (!recursed) {
3094 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3095 SAVEFREEPV(recursed);
3096 }
3097 if (!PAREN_TEST(recursed,paren+1)) {
3098 PAREN_SET(recursed,paren+1);
3099 Newx(newframe,1,scan_frame);
3100 } else {
3101 if (flags & SCF_DO_SUBSTR) {
304ee84b 3102 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3103 data->longest = &(data->longest_float);
3104 }
3105 is_inf = is_inf_internal = 1;
3106 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 3107 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3108 flags &= ~SCF_DO_STCLASS;
3109 }
3110 } else {
3111 Newx(newframe,1,scan_frame);
3112 paren = stopparen;
3113 start = scan+2;
3114 end = regnext(scan);
3115 }
3116 if (newframe) {
3117 assert(start);
3118 assert(end);
3119 SAVEFREEPV(newframe);
3120 newframe->next = regnext(scan);
3121 newframe->last = last;
3122 newframe->stop = stopparen;
3123 newframe->prev = frame;
3124
3125 frame = newframe;
3126 scan = start;
3127 stopparen = paren;
3128 last = end;
3129
3130 continue;
3131 }
3132 }
3133 else if (OP(scan) == EXACT) {
3134 I32 l = STR_LEN(scan);
3135 UV uc;
3136 if (UTF) {
3137 const U8 * const s = (U8*)STRING(scan);
3138 l = utf8_length(s, s + l);
3139 uc = utf8_to_uvchr(s, NULL);
3140 } else {
3141 uc = *((U8*)STRING(scan));
3142 }
3143 min += l;
3144 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3145 /* The code below prefers earlier match for fixed
3146 offset, later match for variable offset. */
3147 if (data->last_end == -1) { /* Update the start info. */
3148 data->last_start_min = data->pos_min;
3149 data->last_start_max = is_inf
3150 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3151 }
8aa23a47
YO
3152 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3153 if (UTF)
3154 SvUTF8_on(data->last_found);
3155 {
3156 SV * const sv = data->last_found;
3157 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3158 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3159 if (mg && mg->mg_len >= 0)
3160 mg->mg_len += utf8_length((U8*)STRING(scan),
3161 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3162 }
8aa23a47
YO
3163 data->last_end = data->pos_min + l;
3164 data->pos_min += l; /* As in the first entry. */
3165 data->flags &= ~SF_BEFORE_EOL;
3166 }
3167 if (flags & SCF_DO_STCLASS_AND) {
3168 /* Check whether it is compatible with what we know already! */
3169 int compat = 1;
3170
54251c2e 3171
486ec47a 3172 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3173 * in the bitmp and either 1) its bit or its fold is set, or 2)
3174 * it's for a locale. Even if there isn't unicode semantics
3175 * here, at runtime there may be because of matching against a
3176 * utf8 string, so accept a possible false positive for
3177 * latin1-range folds */
8aa23a47
YO
3178 if (uc >= 0x100 ||
3179 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3180 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3181 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3182 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47 3183 )
d18bf9dc 3184 {
8aa23a47 3185 compat = 0;
d18bf9dc 3186 }
8aa23a47
YO
3187 ANYOF_CLASS_ZERO(data->start_class);
3188 ANYOF_BITMAP_ZERO(data->start_class);
3189 if (compat)
3190 ANYOF_BITMAP_SET(data->start_class, uc);
d18bf9dc
KW
3191 else if (uc >= 0x100) {
3192 int i;
3193
3194 /* Some Unicode code points fold to the Latin1 range; as
3195 * XXX temporary code, instead of figuring out if this is
3196 * one, just assume it is and set all the start class bits
3197 * that could be some such above 255 code point's fold
3198 * which will generate fals positives. As the code
3199 * elsewhere that does compute the fold settles down, it
3200 * can be extracted out and re-used here */
3201 for (i = 0; i < 256; i++){
3202 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3203 ANYOF_BITMAP_SET(data->start_class, i);
3204 }
3205 }
3206 }
8aa23a47
YO
3207 data->start_class->flags &= ~ANYOF_EOS;
3208 if (uc < 0x100)
3209 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3210 }
3211 else if (flags & SCF_DO_STCLASS_OR) {
3212 /* false positive possible if the class is case-folded */
3213 if (uc < 0x100)
3214 ANYOF_BITMAP_SET(data->start_class, uc);
3215 else
3216 data->start_class->flags |= ANYOF_UNICODE_ALL;
3217 data->start_class->flags &= ~ANYOF_EOS;
3218 cl_and(data->start_class, and_withp);
3219 }
3220 flags &= ~SCF_DO_STCLASS;
3221 }
3222 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3223 I32 l = STR_LEN(scan);
3224 UV uc = *((U8*)STRING(scan));
3225
3226 /* Search for fixed substrings supports EXACT only. */
3227 if (flags & SCF_DO_SUBSTR) {
3228 assert(data);
304ee84b 3229 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3230 }
3231 if (UTF) {
3232 const U8 * const s = (U8 *)STRING(scan);
3233 l = utf8_length(s, s + l);
3234 uc = utf8_to_uvchr(s, NULL);
3235 }
3236 min += l;
3237 if (flags & SCF_DO_SUBSTR)
3238 data->pos_min += l;
3239 if (flags & SCF_DO_STCLASS_AND) {
3240 /* Check whether it is compatible with what we know already! */
3241 int compat = 1;
8aa23a47 3242 if (uc >= 0x100 ||
54251c2e
KW
3243 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3244 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3245 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3246 {
8aa23a47 3247 compat = 0;
54251c2e 3248 }
8aa23a47
YO
3249 ANYOF_CLASS_ZERO(data->start_class);
3250 ANYOF_BITMAP_ZERO(data->start_class);
3251 if (compat) {
3252 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3253 data->start_class->flags &= ~ANYOF_EOS;
39065660 3254 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3255 if (OP(scan) == EXACTFL) {
af302e7f
KW
3256 /* XXX This set is probably no longer necessary, and
3257 * probably wrong as LOCALE now is on in the initial
3258 * state */
8aa23a47 3259 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3260 }
3261 else {
3262
54251c2e
KW
3263 /* Also set the other member of the fold pair. In case
3264 * that unicode semantics is called for at runtime, use
3265 * the full latin1 fold. (Can't do this for locale,
3266 * because not known until runtime */
3267 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3268 }
653099ff 3269 }
d18bf9dc
KW
3270 else if (uc >= 0x100) {
3271 int i;
3272 for (i = 0; i < 256; i++){
3273 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3274 ANYOF_BITMAP_SET(data->start_class, i);
3275 }
3276 }
3277 }
8aa23a47
YO
3278 }
3279 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3280 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3281 /* false positive possible if the class is case-folded.
3282 Assume that the locale settings are the same... */
970c8436 3283 if (uc < 0x100) {
1aa99e6b 3284 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3285 if (OP(scan) != EXACTFL) {
3286
3287 /* And set the other member of the fold pair, but
3288 * can't do that in locale because not known until
3289 * run-time */
3290 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3291 PL_fold_latin1[uc]);
970c8436
KW
3292 }
3293 }
653099ff
GS
3294 data->start_class->flags &= ~ANYOF_EOS;
3295 }
8aa23a47 3296 cl_and(data->start_class, and_withp);
653099ff 3297 }
8aa23a47
YO
3298 flags &= ~SCF_DO_STCLASS;
3299 }
e52fc539 3300 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3301 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3302 I32 f = flags, pos_before = 0;
3303 regnode * const oscan = scan;
3304 struct regnode_charclass_class this_class;
3305 struct regnode_charclass_class *oclass = NULL;
3306 I32 next_is_eval = 0;
3307
3308 switch (PL_regkind[OP(scan)]) {
3309 case WHILEM: /* End of (?:...)* . */
3310 scan = NEXTOPER(scan);
3311 goto finish;
3312 case PLUS:
3313 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3314 next = NEXTOPER(scan);
3315 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3316 mincount = 1;
3317 maxcount = REG_INFTY;
3318 next = regnext(scan);
3319 scan = NEXTOPER(scan);
3320 goto do_curly;
3321 }
3322 }
3323 if (flags & SCF_DO_SUBSTR)
3324 data->pos_min++;
3325 min++;
3326 /* Fall through. */
3327 case STAR:
3328 if (flags & SCF_DO_STCLASS) {
3329 mincount = 0;
3330 maxcount = REG_INFTY;
3331 next = regnext(scan);
3332 scan = NEXTOPER(scan);
3333 goto do_curly;
3334 }
3335 is_inf = is_inf_internal = 1;
3336 scan = regnext(scan);
c277df42 3337 if (flags & SCF_DO_SUBSTR) {
304ee84b 3338 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3339 data->longest = &(data->longest_float);
c277df42 3340 }
8aa23a47
YO
3341 goto optimize_curly_tail;
3342 case CURLY:
3343 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3344 && (scan->flags == stopparen))
3345 {
3346 mincount = 1;
3347 maxcount = 1;
3348 } else {
3349 mincount = ARG1(scan);
3350 maxcount = ARG2(scan);
653099ff 3351 }
8aa23a47
YO
3352 next = regnext(scan);
3353 if (OP(scan) == CURLYX) {
3354 I32 lp = (data ? *(data->last_closep) : 0);
3355 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3356 }
8aa23a47
YO
3357 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3358 next_is_eval = (OP(scan) == EVAL);
3359 do_curly:
3360 if (flags & SCF_DO_SUBSTR) {
304ee84b 3361 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3362 pos_before = data->pos_min;
b45f050a 3363 }
8aa23a47
YO
3364 if (data) {
3365 fl = data->flags;
3366 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3367 if (is_inf)
3368 data->flags |= SF_IS_INF;
3369 }
3370 if (flags & SCF_DO_STCLASS) {
e755fd73 3371 cl_init(pRExC_state, &this_class);
8aa23a47
YO
3372 oclass = data->start_class;
3373 data->start_class = &this_class;
3374 f |= SCF_DO_STCLASS_AND;
3375 f &= ~SCF_DO_STCLASS_OR;
3376 }
779bcb7d
NC
3377 /* Exclude from super-linear cache processing any {n,m}
3378 regops for which the combination of input pos and regex
3379 pos is not enough information to determine if a match
3380 will be possible.
3381
3382 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3383 regex pos at the \s*, the prospects for a match depend not
3384 only on the input position but also on how many (bar\s*)
3385 repeats into the {4,8} we are. */
3386 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3387 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3388
8aa23a47
YO
3389 /* This will finish on WHILEM, setting scan, or on NULL: */
3390 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3391 last, data, stopparen, recursed, NULL,
3392 (mincount == 0
3393 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3394
8aa23a47
YO
3395 if (flags & SCF_DO_STCLASS)
3396 data->start_class = oclass;
3397 if (mincount == 0 || minnext == 0) {
3398 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3399 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3400 }
3401 else if (flags & SCF_DO_STCLASS_AND) {
3402 /* Switch to OR mode: cache the old value of
3403 * data->start_class */
3404 INIT_AND_WITHP;
3405 StructCopy(data->start_class, and_withp,
3406 struct regnode_charclass_class);
3407 flags &= ~SCF_DO_STCLASS_AND;
3408 StructCopy(&this_class, data->start_class,
3409 struct regnode_charclass_class);
3410 flags |= SCF_DO_STCLASS_OR;
3411 data->start_class->flags |= ANYOF_EOS;
3412 }
3413 } else { /* Non-zero len */
3414 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3415 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3416 cl_and(data->start_class, and_withp);
3417 }
3418 else if (flags & SCF_DO_STCLASS_AND)
3419 cl_and(data->start_class, &this_class);
3420 flags &= ~SCF_DO_STCLASS;
3421 }
3422 if (!scan) /* It was not CURLYX, but CURLY. */
3423 scan = next;
3424 if ( /* ? quantifier ok, except for (?{ ... }) */
3425 (next_is_eval || !(mincount == 0 && maxcount == 1))
3426 && (minnext == 0) && (deltanext == 0)
3427 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3428 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3429 {
668c081a
NC
3430 ckWARNreg(RExC_parse,
3431 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3432 }
3433
3434 min += minnext * mincount;
3435 is_inf_internal |= ((maxcount == REG_INFTY
3436 && (minnext + deltanext) > 0)
3437 || deltanext == I32_MAX);
3438 is_inf |= is_inf_internal;
3439 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3440
3441 /* Try powerful optimization CURLYX => CURLYN. */
3442 if ( OP(oscan) == CURLYX && data
3443 && data->flags & SF_IN_PAR
3444 && !(data->flags & SF_HAS_EVAL)
3445 && !deltanext && minnext == 1 ) {
3446 /* Try to optimize to CURLYN. */
3447 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3448 regnode * const nxt1 = nxt;
497b47a8 3449#ifdef DEBUGGING
8aa23a47 3450 regnode *nxt2;
497b47a8 3451#endif
c277df42 3452
8aa23a47
YO
3453 /* Skip open. */
3454 nxt = regnext(nxt);
e52fc539 3455 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3456 && !(PL_regkind[OP(nxt)] == EXACT
3457 && STR_LEN(nxt) == 1))
3458 goto nogo;
497b47a8 3459#ifdef DEBUGGING
8aa23a47 3460 nxt2 = nxt;
497b47a8 3461#endif
8aa23a47
YO
3462 nxt = regnext(nxt);
3463 if (OP(nxt) != CLOSE)
3464 goto nogo;
3465 if (RExC_open_parens) {
3466 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3467 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3468 }
3469 /* Now we know that nxt2 is the only contents: */
3470 oscan->flags = (U8)ARG(nxt);
3471 OP(oscan) = CURLYN;
3472 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3473
c277df42 3474#ifdef DEBUGGING
8aa23a47 3475 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3476 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3477 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3478 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3479 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3480 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3481#endif
8aa23a47
YO
3482 }
3483 nogo:
3484
3485 /* Try optimization CURLYX => CURLYM. */
3486 if ( OP(oscan) == CURLYX && data
3487 && !(data->flags & SF_HAS_PAR)
3488 && !(data->flags & SF_HAS_EVAL)
3489 && !deltanext /* atom is fixed width */
3490 && minnext != 0 /* CURLYM can't handle zero width */
3491 ) {
3492 /* XXXX How to optimize if data == 0? */
3493 /* Optimize to a simpler form. */
3494 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3495 regnode *nxt2;
3496
3497 OP(oscan) = CURLYM;
3498 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3499 && (OP(nxt2) != WHILEM))
3500 nxt = nxt2;
3501 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3502 /* Need to optimize away parenths. */
b3c0965f 3503 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3504 /* Set the parenth number. */
3505 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3506
8aa23a47
YO
3507 oscan->flags = (U8)ARG(nxt);
3508 if (RExC_open_parens) {
3509 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3510 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3511 }
8aa23a47
YO
3512 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3513 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3514
c277df42 3515#ifdef DEBUGGING
8aa23a47
YO
3516 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3517 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3518 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3519 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3520#endif
c277df42 3521#if 0
8aa23a47
YO
3522 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3523 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3524 if (nnxt == nxt) {
3525 if (reg_off_by_arg[OP(nxt1)])
3526 ARG_SET(nxt1, nxt2 - nxt1);
3527 else if (nxt2 - nxt1 < U16_MAX)
3528 NEXT_OFF(nxt1) = nxt2 - nxt1;
3529 else
3530 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3531 }
8aa23a47 3532 nxt1 = nnxt;
c277df42 3533 }
5d1c421c 3534#endif
8aa23a47
YO
3535 /* Optimize again: */
3536 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3537 NULL, stopparen, recursed, NULL, 0,depth+1);
3538 }
3539 else
3540 oscan->flags = 0;
3541 }
3542 else if ((OP(oscan) == CURLYX)
3543 && (flags & SCF_WHILEM_VISITED_POS)
3544 /* See the comment on a similar expression above.
3b753521 3545 However, this time it's not a subexpression
8aa23a47
YO
3546 we care about, but the expression itself. */
3547 && (maxcount == REG_INFTY)
3548 && data && ++data->whilem_c < 16) {
3549 /* This stays as CURLYX, we can put the count/of pair. */
3550 /* Find WHILEM (as in regexec.c) */
3551 regnode *nxt = oscan + NEXT_OFF(oscan);
3552
3553 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3554 nxt += ARG(nxt);
3555 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3556 | (RExC_whilem_seen << 4)); /* On WHILEM */
3557 }
3558 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3559 pars++;
3560 if (flags & SCF_DO_SUBSTR) {
3561 SV *last_str = NULL;
3562 int counted = mincount != 0;
a0ed51b3 3563
8aa23a47
YO
3564 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3565#if defined(SPARC64_GCC_WORKAROUND)
3566 I32 b = 0;
3567 STRLEN l = 0;
3568 const char *s = NULL;
3569 I32 old = 0;
b515a41d 3570
8aa23a47
YO
3571 if (pos_before >= data->last_start_min)
3572 b = pos_before;
3573 else
3574 b = data->last_start_min;
b515a41d 3575
8aa23a47
YO
3576 l = 0;
3577 s = SvPV_const(data->last_found, l);
3578 old = b - data->last_start_min;
3579
3580#else
3581 I32 b = pos_before >= data->last_start_min
3582 ? pos_before : data->last_start_min;
3583 STRLEN l;
3584 const char * const s = SvPV_const(data->last_found, l);
3585 I32 old = b - data->last_start_min;
3586#endif
3587
3588 if (UTF)
3589 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3590 l -= old;
3591 /* Get the added string: */
740cce10 3592 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3593 if (deltanext == 0 && pos_before == b) {
3594 /* What was added is a constant string */
3595 if (mincount > 1) {
3596 SvGROW(last_str, (mincount * l) + 1);
3597 repeatcpy(SvPVX(last_str) + l,
3598 SvPVX_const(last_str), l, mincount - 1);
3599 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3600 /* Add additional parts. */
3601 SvCUR_set(data->last_found,
3602 SvCUR(data->last_found) - l);
3603 sv_catsv(data->last_found, last_str);
3604 {
3605 SV * sv = data->last_found;
3606 MAGIC *mg =
3607 SvUTF8(sv) && SvMAGICAL(sv) ?
3608 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3609 if (mg && mg->mg_len >= 0)
bd94e887 3610 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3611 }
8aa23a47 3612 data->last_end += l * (mincount - 1);
b515a41d 3613 }
8aa23a47
YO
3614 } else {
3615 /* start offset must point into the last copy */
3616 data->last_start_min += minnext * (mincount - 1);
3617 data->last_start_max += is_inf ? I32_MAX
3618 : (maxcount - 1) * (minnext + data->pos_delta);
3619 }
c277df42 3620 }
8aa23a47
YO
3621 /* It is counted once already... */
3622 data->pos_min += minnext * (mincount - counted);
3623 data->pos_delta += - counted * deltanext +
3624 (minnext + deltanext) * maxcount - minnext * mincount;
3625 if (mincount != maxcount) {
3626 /* Cannot extend fixed substrings found inside
3627 the group. */
304ee84b 3628 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3629 if (mincount && last_str) {
3630 SV * const sv = data->last_found;
3631 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3632 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3633
3634 if (mg)
3635 mg->mg_len = -1;
3636 sv_setsv(sv, last_str);
3637 data->last_end = data->pos_min;
3638 data->last_start_min =
3639 data->pos_min - CHR_SVLEN(last_str);
3640 data->last_start_max = is_inf
3641 ? I32_MAX
3642 : data->pos_min + data->pos_delta
3643 - CHR_SVLEN(last_str);
3644 }
3645 data->longest = &(data->longest_float);
3646 }
3647 SvREFCNT_dec(last_str);
c277df42 3648 }
8aa23a47
YO
3649 if (data && (fl & SF_HAS_EVAL))
3650 data->flags |= SF_HAS_EVAL;
3651 optimize_curly_tail:
3652 if (OP(oscan) != CURLYX) {
3653 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3654 && NEXT_OFF(next))
3655 NEXT_OFF(oscan) += NEXT_OFF(next);
3656 }
3657 continue;
f56b6394 3658 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 3659 if (flags & SCF_DO_SUBSTR) {
304ee84b 3660 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3661 data->longest = &(data->longest_float);
3662 }
3663 is_inf = is_inf_internal = 1;
3664 if (flags & SCF_DO_STCLASS_OR)
3fffb88a 3665 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3666 flags &= ~SCF_DO_STCLASS;
3667 break;
c277df42 3668 }
8aa23a47 3669 }
e1d1eefb
YO
3670 else if (OP(scan) == LNBREAK) {
3671 if (flags & SCF_DO_STCLASS) {
3672 int value = 0;
3673 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3674 if (flags & SCF_DO_STCLASS_AND) {
3675 for (value = 0; value < 256; value++)
e64b1bd1 3676 if (!is_VERTWS_cp(value))
b9a59e08
KW
3677 ANYOF_BITMAP_CLEAR(data->start_class, value);
3678 }