This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp: Add warning if tries to use \p in locale.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
e40e74fe
KW
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
81714fb9 140 HV *paren_names; /* Paren names */
1f1031fe 141
40d049e4
YO
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
b57e4118 144 I32 in_lookbehind;
830247a4
IZ
145#if ADD_TO_REGEXEC
146 char *starttry; /* -Dr: where regtry was called. */
147#define RExC_starttry (pRExC_state->starttry)
148#endif
3dab1dad 149#ifdef DEBUGGING
be8e71aa 150 const char *lastparse;
3dab1dad 151 I32 lastnum;
1f1031fe 152 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
153#define RExC_lastparse (pRExC_state->lastparse)
154#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 155#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 156#endif
830247a4
IZ
157} RExC_state_t;
158
e2509266 159#define RExC_flags (pRExC_state->flags)
830247a4 160#define RExC_precomp (pRExC_state->precomp)
288b8c02 161#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 162#define RExC_rx (pRExC_state->rx)
f8fc2ecf 163#define RExC_rxi (pRExC_state->rxi)
fac92740 164#define RExC_start (pRExC_state->start)
830247a4
IZ
165#define RExC_end (pRExC_state->end)
166#define RExC_parse (pRExC_state->parse)
167#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
168#ifdef RE_TRACK_PATTERN_OFFSETS
169#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
170#endif
830247a4 171#define RExC_emit (pRExC_state->emit)
fac92740 172#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 173#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
174#define RExC_naughty (pRExC_state->naughty)
175#define RExC_sawback (pRExC_state->sawback)
176#define RExC_seen (pRExC_state->seen)
177#define RExC_size (pRExC_state->size)
178#define RExC_npar (pRExC_state->npar)
e2e6a0f1 179#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
180#define RExC_extralen (pRExC_state->extralen)
181#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
182#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 183#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 184#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 185#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
186#define RExC_open_parens (pRExC_state->open_parens)
187#define RExC_close_parens (pRExC_state->close_parens)
188#define RExC_opend (pRExC_state->opend)
81714fb9 189#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
190#define RExC_recurse (pRExC_state->recurse)
191#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 192#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
830247a4 193
cde0cee5 194
a687059c
LW
195#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
196#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
197 ((*s) == '{' && regcurly(s)))
a687059c 198
35c8bce7
LW
199#ifdef SPSTART
200#undef SPSTART /* dratted cpp namespace... */
201#endif
a687059c
LW
202/*
203 * Flags to be passed up and down.
204 */
a687059c 205#define WORST 0 /* Worst case. */
a3b492c3 206#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
207
208/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
d7b56a3c 209 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
fda99bee 210#define SIMPLE 0x02
a3b492c3
YO
211#define SPSTART 0x04 /* Starts with * or +. */
212#define TRYAGAIN 0x08 /* Weeded out a declaration. */
213#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 214
3dab1dad
YO
215#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
216
07be1b83
YO
217/* whether trie related optimizations are enabled */
218#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
219#define TRIE_STUDY_OPT
786e8c11 220#define FULL_TRIE_STUDY
07be1b83
YO
221#define TRIE_STCLASS
222#endif
1de06328
YO
223
224
40d049e4
YO
225
226#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
227#define PBITVAL(paren) (1 << ((paren) & 7))
228#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
229#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
230#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
231
bbd61b5f
KW
232/* If not already in utf8, do a longjmp back to the beginning */
233#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
234#define REQUIRE_UTF8 STMT_START { \
235 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
236 } STMT_END
40d049e4 237
1de06328
YO
238/* About scan_data_t.
239
240 During optimisation we recurse through the regexp program performing
241 various inplace (keyhole style) optimisations. In addition study_chunk
242 and scan_commit populate this data structure with information about
243 what strings MUST appear in the pattern. We look for the longest
3b753521 244 string that must appear at a fixed location, and we look for the
1de06328
YO
245 longest string that may appear at a floating location. So for instance
246 in the pattern:
247
248 /FOO[xX]A.*B[xX]BAR/
249
250 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
251 strings (because they follow a .* construct). study_chunk will identify
252 both FOO and BAR as being the longest fixed and floating strings respectively.
253
254 The strings can be composites, for instance
255
256 /(f)(o)(o)/
257
258 will result in a composite fixed substring 'foo'.
259
260 For each string some basic information is maintained:
261
262 - offset or min_offset
263 This is the position the string must appear at, or not before.
264 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
265 characters must match before the string we are searching for.
266 Likewise when combined with minlenp and the length of the string it
1de06328
YO
267 tells us how many characters must appear after the string we have
268 found.
269
270 - max_offset
271 Only used for floating strings. This is the rightmost point that
3b753521 272 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
273 string can occur infinitely far to the right.
274
275 - minlenp
276 A pointer to the minimum length of the pattern that the string
277 was found inside. This is important as in the case of positive
278 lookahead or positive lookbehind we can have multiple patterns
279 involved. Consider
280
281 /(?=FOO).*F/
282
283 The minimum length of the pattern overall is 3, the minimum length
284 of the lookahead part is 3, but the minimum length of the part that
285 will actually match is 1. So 'FOO's minimum length is 3, but the
286 minimum length for the F is 1. This is important as the minimum length
287 is used to determine offsets in front of and behind the string being
288 looked for. Since strings can be composites this is the length of the
486ec47a 289 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
290 the length is calculated by study_chunk, so that the minimum lengths
291 are not known until the full pattern has been compiled, thus the
292 pointer to the value.
293
294 - lookbehind
295
296 In the case of lookbehind the string being searched for can be
297 offset past the start point of the final matching string.
298 If this value was just blithely removed from the min_offset it would
299 invalidate some of the calculations for how many chars must match
300 before or after (as they are derived from min_offset and minlen and
301 the length of the string being searched for).
302 When the final pattern is compiled and the data is moved from the
303 scan_data_t structure into the regexp structure the information
304 about lookbehind is factored in, with the information that would
305 have been lost precalculated in the end_shift field for the
306 associated string.
307
308 The fields pos_min and pos_delta are used to store the minimum offset
309 and the delta to the maximum offset at the current point in the pattern.
310
311*/
2c2d71f5
JH
312
313typedef struct scan_data_t {
1de06328
YO
314 /*I32 len_min; unused */
315 /*I32 len_delta; unused */
2c2d71f5
JH
316 I32 pos_min;
317 I32 pos_delta;
318 SV *last_found;
1de06328 319 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
320 I32 last_start_min;
321 I32 last_start_max;
1de06328
YO
322 SV **longest; /* Either &l_fixed, or &l_float. */
323 SV *longest_fixed; /* longest fixed string found in pattern */
324 I32 offset_fixed; /* offset where it starts */
486ec47a 325 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
326 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
327 SV *longest_float; /* longest floating string found in pattern */
328 I32 offset_float_min; /* earliest point in string it can appear */
329 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 330 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 331 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
332 I32 flags;
333 I32 whilem_c;
cb434fcc 334 I32 *last_closep;
653099ff 335 struct regnode_charclass_class *start_class;
2c2d71f5
JH
336} scan_data_t;
337
a687059c 338/*
e50aee73 339 * Forward declarations for pregcomp()'s friends.
a687059c 340 */
a0d0e21e 341
27da23d5 342static const scan_data_t zero_scan_data =
1de06328 343 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
344
345#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
346#define SF_BEFORE_SEOL 0x0001
347#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
348#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
349#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
350
09b7f37c
CB
351#ifdef NO_UNARY_PLUS
352# define SF_FIX_SHIFT_EOL (0+2)
353# define SF_FL_SHIFT_EOL (0+4)
354#else
355# define SF_FIX_SHIFT_EOL (+2)
356# define SF_FL_SHIFT_EOL (+4)
357#endif
c277df42
IZ
358
359#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
360#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
361
362#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
363#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
364#define SF_IS_INF 0x0040
365#define SF_HAS_PAR 0x0080
366#define SF_IN_PAR 0x0100
367#define SF_HAS_EVAL 0x0200
368#define SCF_DO_SUBSTR 0x0400
653099ff
GS
369#define SCF_DO_STCLASS_AND 0x0800
370#define SCF_DO_STCLASS_OR 0x1000
371#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 372#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 373
786e8c11 374#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 375#define SCF_SEEN_ACCEPT 0x8000
07be1b83 376
43fead97 377#define UTF cBOOL(RExC_utf8)
a62b1201
KW
378#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
379#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
380#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
381#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
382#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 383
43fead97 384#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 385
ffc61ed2 386#define OOB_UNICODE 12345678
93733859 387#define OOB_NAMEDCLASS -1
b8c5462f 388
a0ed51b3
LW
389#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
390#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
391
8615cb43 392
b45f050a
JF
393/* length of regex to show in messages that don't mark a position within */
394#define RegexLengthToShowInErrorMessages 127
395
396/*
397 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
398 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
399 * op/pragma/warn/regcomp.
400 */
7253e4e3
RK
401#define MARKER1 "<-- HERE" /* marker as it appears in the description */
402#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 403
7253e4e3 404#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
405
406/*
407 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
408 * arg. Show regex, up to a maximum length. If it's too long, chop and add
409 * "...".
410 */
58e23c8d 411#define _FAIL(code) STMT_START { \
bfed75c6 412 const char *ellipses = ""; \
ccb2c380
MP
413 IV len = RExC_end - RExC_precomp; \
414 \
415 if (!SIZE_ONLY) \
288b8c02 416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
417 if (len > RegexLengthToShowInErrorMessages) { \
418 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
419 len = RegexLengthToShowInErrorMessages - 10; \
420 ellipses = "..."; \
421 } \
58e23c8d 422 code; \
ccb2c380 423} STMT_END
8615cb43 424
58e23c8d
YO
425#define FAIL(msg) _FAIL( \
426 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
427 msg, (int)len, RExC_precomp, ellipses))
428
429#define FAIL2(msg,arg) _FAIL( \
430 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
431 arg, (int)len, RExC_precomp, ellipses))
432
b45f050a 433/*
b45f050a
JF
434 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
435 */
ccb2c380 436#define Simple_vFAIL(m) STMT_START { \
a28509cc 437 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
438 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
439 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
440} STMT_END
b45f050a
JF
441
442/*
443 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
444 */
ccb2c380
MP
445#define vFAIL(m) STMT_START { \
446 if (!SIZE_ONLY) \
288b8c02 447 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
448 Simple_vFAIL(m); \
449} STMT_END
b45f050a
JF
450
451/*
452 * Like Simple_vFAIL(), but accepts two arguments.
453 */
ccb2c380 454#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 455 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
458} STMT_END
b45f050a
JF
459
460/*
461 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
462 */
ccb2c380
MP
463#define vFAIL2(m,a1) STMT_START { \
464 if (!SIZE_ONLY) \
288b8c02 465 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
466 Simple_vFAIL2(m, a1); \
467} STMT_END
b45f050a
JF
468
469
470/*
471 * Like Simple_vFAIL(), but accepts three arguments.
472 */
ccb2c380 473#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 474 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
475 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
476 (int)offset, RExC_precomp, RExC_precomp + offset); \
477} STMT_END
b45f050a
JF
478
479/*
480 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
481 */
ccb2c380
MP
482#define vFAIL3(m,a1,a2) STMT_START { \
483 if (!SIZE_ONLY) \
288b8c02 484 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
485 Simple_vFAIL3(m, a1, a2); \
486} STMT_END
b45f050a
JF
487
488/*
489 * Like Simple_vFAIL(), but accepts four arguments.
490 */
ccb2c380 491#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 492 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
493 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
494 (int)offset, RExC_precomp, RExC_precomp + offset); \
495} STMT_END
b45f050a 496
668c081a 497#define ckWARNreg(loc,m) STMT_START { \
a28509cc 498 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
499 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
500 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
501} STMT_END
502
668c081a 503#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 504 const IV offset = loc - RExC_precomp; \
d1d15184 505 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
506 m REPORT_LOCATION, \
507 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
508} STMT_END
509
668c081a 510#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 511 const IV offset = loc - RExC_precomp; \
668c081a 512 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
513 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
514} STMT_END
515
516#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 517 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
518 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
519 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
520} STMT_END
521
668c081a
NC
522#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
523 const IV offset = loc - RExC_precomp; \
524 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
525 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
526} STMT_END
527
ccb2c380 528#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 529 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
530 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
531 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
532} STMT_END
533
668c081a
NC
534#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
535 const IV offset = loc - RExC_precomp; \
536 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
537 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
538} STMT_END
539
ccb2c380 540#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 541 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
542 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
543 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
544} STMT_END
9d1d55b5 545
8615cb43 546
cd439c50 547/* Allow for side effects in s */
ccb2c380
MP
548#define REGC(c,s) STMT_START { \
549 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
550} STMT_END
cd439c50 551
fac92740
MJD
552/* Macros for recording node offsets. 20001227 mjd@plover.com
553 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
554 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
555 * Element 0 holds the number n.
07be1b83 556 * Position is 1 indexed.
fac92740 557 */
7122b237
YO
558#ifndef RE_TRACK_PATTERN_OFFSETS
559#define Set_Node_Offset_To_R(node,byte)
560#define Set_Node_Offset(node,byte)
561#define Set_Cur_Node_Offset
562#define Set_Node_Length_To_R(node,len)
563#define Set_Node_Length(node,len)
564#define Set_Node_Cur_Length(node)
565#define Node_Offset(n)
566#define Node_Length(n)
567#define Set_Node_Offset_Length(node,offset,len)
568#define ProgLen(ri) ri->u.proglen
569#define SetProgLen(ri,x) ri->u.proglen = x
570#else
571#define ProgLen(ri) ri->u.offsets[0]
572#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
573#define Set_Node_Offset_To_R(node,byte) STMT_START { \
574 if (! SIZE_ONLY) { \
575 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 576 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 577 if((node) < 0) { \
551405c4 578 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
579 } else { \
580 RExC_offsets[2*(node)-1] = (byte); \
581 } \
582 } \
583} STMT_END
584
585#define Set_Node_Offset(node,byte) \
586 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
587#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
588
589#define Set_Node_Length_To_R(node,len) STMT_START { \
590 if (! SIZE_ONLY) { \
591 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 592 __LINE__, (int)(node), (int)(len))); \
ccb2c380 593 if((node) < 0) { \
551405c4 594 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
595 } else { \
596 RExC_offsets[2*(node)] = (len); \
597 } \
598 } \
599} STMT_END
600
601#define Set_Node_Length(node,len) \
602 Set_Node_Length_To_R((node)-RExC_emit_start, len)
603#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
604#define Set_Node_Cur_Length(node) \
605 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
606
607/* Get offsets and lengths */
608#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
609#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
610
07be1b83
YO
611#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
612 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
613 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
614} STMT_END
7122b237 615#endif
07be1b83
YO
616
617#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
618#define EXPERIMENTAL_INPLACESCAN
f427392e 619#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 620
304ee84b
YO
621#define DEBUG_STUDYDATA(str,data,depth) \
622DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 623 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
624 "%*s" str "Pos:%"IVdf"/%"IVdf \
625 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
626 (int)(depth)*2, "", \
627 (IV)((data)->pos_min), \
628 (IV)((data)->pos_delta), \
304ee84b 629 (UV)((data)->flags), \
1de06328 630 (IV)((data)->whilem_c), \
304ee84b
YO
631 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
632 is_inf ? "INF " : "" \
1de06328
YO
633 ); \
634 if ((data)->last_found) \
635 PerlIO_printf(Perl_debug_log, \
636 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
637 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
638 SvPVX_const((data)->last_found), \
639 (IV)((data)->last_end), \
640 (IV)((data)->last_start_min), \
641 (IV)((data)->last_start_max), \
642 ((data)->longest && \
643 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
644 SvPVX_const((data)->longest_fixed), \
645 (IV)((data)->offset_fixed), \
646 ((data)->longest && \
647 (data)->longest==&((data)->longest_float)) ? "*" : "", \
648 SvPVX_const((data)->longest_float), \
649 (IV)((data)->offset_float_min), \
650 (IV)((data)->offset_float_max) \
651 ); \
652 PerlIO_printf(Perl_debug_log,"\n"); \
653});
654
acfe0abc 655static void clear_re(pTHX_ void *r);
4327152a 656
653099ff 657/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 658 Update the longest found anchored substring and the longest found
653099ff
GS
659 floating substrings if needed. */
660
4327152a 661STATIC void
304ee84b 662S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 663{
e1ec3a88
AL
664 const STRLEN l = CHR_SVLEN(data->last_found);
665 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 666 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 667
7918f24d
NC
668 PERL_ARGS_ASSERT_SCAN_COMMIT;
669
c277df42 670 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 671 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
672 if (*data->longest == data->longest_fixed) {
673 data->offset_fixed = l ? data->last_start_min : data->pos_min;
674 if (data->flags & SF_BEFORE_EOL)
b81d288d 675 data->flags
c277df42
IZ
676 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
677 else
678 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
679 data->minlen_fixed=minlenp;
680 data->lookbehind_fixed=0;
a0ed51b3 681 }
304ee84b 682 else { /* *data->longest == data->longest_float */
c277df42 683 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
684 data->offset_float_max = (l
685 ? data->last_start_max
c277df42 686 : data->pos_min + data->pos_delta);
304ee84b 687 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 688 data->offset_float_max = I32_MAX;
c277df42 689 if (data->flags & SF_BEFORE_EOL)
b81d288d 690 data->flags
c277df42
IZ
691 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
692 else
693 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
694 data->minlen_float=minlenp;
695 data->lookbehind_float=0;
c277df42
IZ
696 }
697 }
698 SvCUR_set(data->last_found, 0);
0eda9292 699 {
a28509cc 700 SV * const sv = data->last_found;
097eb12c
AL
701 if (SvUTF8(sv) && SvMAGICAL(sv)) {
702 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
703 if (mg)
704 mg->mg_len = 0;
705 }
0eda9292 706 }
c277df42
IZ
707 data->last_end = -1;
708 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 709 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
710}
711
653099ff
GS
712/* Can match anything (initialization) */
713STATIC void
097eb12c 714S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 715{
7918f24d
NC
716 PERL_ARGS_ASSERT_CL_ANYTHING;
717
653099ff 718 ANYOF_CLASS_ZERO(cl);
f8bef550 719 ANYOF_BITMAP_SETALL(cl);
11454c59 720 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
653099ff
GS
721 if (LOC)
722 cl->flags |= ANYOF_LOCALE;
723}
724
725/* Can match anything (initialization) */
726STATIC int
5f66b61c 727S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
728{
729 int value;
730
7918f24d
NC
731 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
732
aaa51d5e 733 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
734 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
735 return 1;
1aa99e6b
IH
736 if (!(cl->flags & ANYOF_UNICODE_ALL))
737 return 0;
10edeb5d 738 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 739 return 0;
653099ff
GS
740 return 1;
741}
742
743/* Can match anything (initialization) */
744STATIC void
097eb12c 745S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 746{
7918f24d
NC
747 PERL_ARGS_ASSERT_CL_INIT;
748
8ecf7187 749 Zero(cl, 1, struct regnode_charclass_class);
653099ff 750 cl->type = ANYOF;
830247a4 751 cl_anything(pRExC_state, cl);
653099ff
GS
752}
753
754STATIC void
097eb12c 755S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 756{
7918f24d
NC
757 PERL_ARGS_ASSERT_CL_INIT_ZERO;
758
8ecf7187 759 Zero(cl, 1, struct regnode_charclass_class);
653099ff 760 cl->type = ANYOF;
830247a4 761 cl_anything(pRExC_state, cl);
653099ff
GS
762 if (LOC)
763 cl->flags |= ANYOF_LOCALE;
764}
765
766/* 'And' a given class with another one. Can create false positives */
767/* We assume that cl is not inverted */
768STATIC void
5f66b61c 769S_cl_and(struct regnode_charclass_class *cl,
a28509cc 770 const struct regnode_charclass_class *and_with)
653099ff 771{
7918f24d 772 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
773
774 assert(and_with->type == ANYOF);
1e6ade67
KW
775
776 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
777 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 778 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
779 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
780 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
781 int i;
782
783 if (and_with->flags & ANYOF_INVERT)
784 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
785 cl->bitmap[i] &= ~and_with->bitmap[i];
786 else
787 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
788 cl->bitmap[i] &= and_with->bitmap[i];
789 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
790 if (!(and_with->flags & ANYOF_EOS))
791 cl->flags &= ~ANYOF_EOS;
1aa99e6b 792
39065660
KW
793 if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
794 cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
11454c59
KW
795 if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
796 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
8951c461 797
3ff7ceb3 798 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
14ebb1a2 799 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b 800 cl->flags &= ~ANYOF_UNICODE_ALL;
ef87b810
KW
801 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
802 only the one(s)
803 actually set */
1aa99e6b
IH
804 ARG_SET(cl, ARG(and_with));
805 }
14ebb1a2
JH
806 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
807 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 808 cl->flags &= ~ANYOF_UNICODE_ALL;
3ff7ceb3 809 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
14ebb1a2 810 !(and_with->flags & ANYOF_INVERT))
3ff7ceb3 811 cl->flags &= ~ANYOF_NONBITMAP;
653099ff
GS
812}
813
814/* 'OR' a given class with another one. Can create false positives */
815/* We assume that cl is not inverted */
816STATIC void
097eb12c 817S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 818{
7918f24d
NC
819 PERL_ARGS_ASSERT_CL_OR;
820
653099ff
GS
821 if (or_with->flags & ANYOF_INVERT) {
822 /* We do not use
823 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
824 * <= (B1 | !B2) | (CL1 | !CL2)
825 * which is wasteful if CL2 is small, but we ignore CL2:
826 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
827 * XXXX Can we handle case-fold? Unclear:
828 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
829 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
830 */
831 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
832 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
833 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
834 int i;
835
836 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
837 cl->bitmap[i] |= ~or_with->bitmap[i];
838 } /* XXXX: logic is complicated otherwise */
839 else {
830247a4 840 cl_anything(pRExC_state, cl);
653099ff
GS
841 }
842 } else {
843 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
844 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
845 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
846 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
847 int i;
848
849 /* OR char bitmap and class bitmap separately */
850 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
851 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 852 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
853 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
854 cl->classflags[i] |= or_with->classflags[i];
855 cl->flags |= ANYOF_CLASS;
856 }
857 }
858 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 859 cl_anything(pRExC_state, cl);
653099ff
GS
860 }
861 }
862 if (or_with->flags & ANYOF_EOS)
863 cl->flags |= ANYOF_EOS;
11454c59
KW
864 if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
865 cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
1aa99e6b 866
39065660
KW
867 if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
868 cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
8951c461 869
9826f543
KW
870 /* If both nodes match something outside the bitmap, but what they match
871 * outside is not the same pointer, and hence not easily compared, give up
872 * and allow the start class to match everything outside the bitmap */
3ff7ceb3 873 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
1aa99e6b
IH
874 ARG(cl) != ARG(or_with)) {
875 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 876 }
9826f543 877
1aa99e6b
IH
878 if (or_with->flags & ANYOF_UNICODE_ALL) {
879 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 880 }
653099ff
GS
881}
882
a3621e74
YO
883#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
884#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
885#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
886#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
887
3dab1dad
YO
888
889#ifdef DEBUGGING
07be1b83 890/*
2b8b4781
NC
891 dump_trie(trie,widecharmap,revcharmap)
892 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
893 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
894
895 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
896 The _interim_ variants are used for debugging the interim
897 tables that are used to generate the final compressed
898 representation which is what dump_trie expects.
899
486ec47a 900 Part of the reason for their existence is to provide a form
3dab1dad 901 of documentation as to how the different representations function.
07be1b83
YO
902
903*/
3dab1dad
YO
904
905/*
3dab1dad
YO
906 Dumps the final compressed table form of the trie to Perl_debug_log.
907 Used for debugging make_trie().
908*/
b9a59e08 909
3dab1dad 910STATIC void
2b8b4781
NC
911S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
912 AV *revcharmap, U32 depth)
3dab1dad
YO
913{
914 U32 state;
ab3bbdeb 915 SV *sv=sv_newmortal();
55eed653 916 int colwidth= widecharmap ? 6 : 4;
2e64971a 917 U16 word;
3dab1dad
YO
918 GET_RE_DEBUG_FLAGS_DECL;
919
7918f24d 920 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 921
3dab1dad
YO
922 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
923 (int)depth * 2 + 2,"",
924 "Match","Base","Ofs" );
925
926 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 927 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 928 if ( tmp ) {
ab3bbdeb
YO
929 PerlIO_printf( Perl_debug_log, "%*s",
930 colwidth,
ddc5bc0f 931 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
932 PL_colors[0], PL_colors[1],
933 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
934 PERL_PV_ESCAPE_FIRSTCHAR
935 )
936 );
3dab1dad
YO
937 }
938 }
939 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
940 (int)depth * 2 + 2,"");
941
942 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 943 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
944 PerlIO_printf( Perl_debug_log, "\n");
945
1e2e3d02 946 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 947 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
948
949 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
950
951 if ( trie->states[ state ].wordnum ) {
952 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
953 } else {
954 PerlIO_printf( Perl_debug_log, "%6s", "" );
955 }
956
957 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
958
959 if ( base ) {
960 U32 ofs = 0;
961
962 while( ( base + ofs < trie->uniquecharcount ) ||
963 ( base + ofs - trie->uniquecharcount < trie->lasttrans
964 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
965 ofs++;
966
967 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
968
969 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
970 if ( ( base + ofs >= trie->uniquecharcount ) &&
971 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
972 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
973 {
ab3bbdeb
YO
974 PerlIO_printf( Perl_debug_log, "%*"UVXf,
975 colwidth,
3dab1dad
YO
976 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
977 } else {
ab3bbdeb 978 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
979 }
980 }
981
982 PerlIO_printf( Perl_debug_log, "]");
983
984 }
985 PerlIO_printf( Perl_debug_log, "\n" );
986 }
2e64971a
DM
987 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
988 for (word=1; word <= trie->wordcount; word++) {
989 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
990 (int)word, (int)(trie->wordinfo[word].prev),
991 (int)(trie->wordinfo[word].len));
992 }
993 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
994}
995/*
3dab1dad
YO
996 Dumps a fully constructed but uncompressed trie in list form.
997 List tries normally only are used for construction when the number of
998 possible chars (trie->uniquecharcount) is very high.
999 Used for debugging make_trie().
1000*/
1001STATIC void
55eed653 1002S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1003 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1004 U32 depth)
3dab1dad
YO
1005{
1006 U32 state;
ab3bbdeb 1007 SV *sv=sv_newmortal();
55eed653 1008 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1009 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1010
1011 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1012
3dab1dad 1013 /* print out the table precompression. */
ab3bbdeb
YO
1014 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1015 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1016 "------:-----+-----------------\n" );
3dab1dad
YO
1017
1018 for( state=1 ; state < next_alloc ; state ++ ) {
1019 U16 charid;
1020
ab3bbdeb 1021 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1022 (int)depth * 2 + 2,"", (UV)state );
1023 if ( ! trie->states[ state ].wordnum ) {
1024 PerlIO_printf( Perl_debug_log, "%5s| ","");
1025 } else {
1026 PerlIO_printf( Perl_debug_log, "W%4x| ",
1027 trie->states[ state ].wordnum
1028 );
1029 }
1030 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1031 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1032 if ( tmp ) {
1033 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1034 colwidth,
ddc5bc0f 1035 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1036 PL_colors[0], PL_colors[1],
1037 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1038 PERL_PV_ESCAPE_FIRSTCHAR
1039 ) ,
1e2e3d02
YO
1040 TRIE_LIST_ITEM(state,charid).forid,
1041 (UV)TRIE_LIST_ITEM(state,charid).newstate
1042 );
1043 if (!(charid % 10))
664e119d
RGS
1044 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1045 (int)((depth * 2) + 14), "");
1e2e3d02 1046 }
ab3bbdeb
YO
1047 }
1048 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1049 }
1050}
1051
1052/*
3dab1dad
YO
1053 Dumps a fully constructed but uncompressed trie in table form.
1054 This is the normal DFA style state transition table, with a few
1055 twists to facilitate compression later.
1056 Used for debugging make_trie().
1057*/
1058STATIC void
55eed653 1059S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1060 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1061 U32 depth)
3dab1dad
YO
1062{
1063 U32 state;
1064 U16 charid;
ab3bbdeb 1065 SV *sv=sv_newmortal();
55eed653 1066 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1067 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1068
1069 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1070
1071 /*
1072 print out the table precompression so that we can do a visual check
1073 that they are identical.
1074 */
1075
1076 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1077
1078 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1079 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1080 if ( tmp ) {
ab3bbdeb
YO
1081 PerlIO_printf( Perl_debug_log, "%*s",
1082 colwidth,
ddc5bc0f 1083 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1084 PL_colors[0], PL_colors[1],
1085 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1086 PERL_PV_ESCAPE_FIRSTCHAR
1087 )
1088 );
3dab1dad
YO
1089 }
1090 }
1091
1092 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1093
1094 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1095 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1096 }
1097
1098 PerlIO_printf( Perl_debug_log, "\n" );
1099
1100 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1101
1102 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1103 (int)depth * 2 + 2,"",
1104 (UV)TRIE_NODENUM( state ) );
1105
1106 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1107 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1108 if (v)
1109 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1110 else
1111 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1112 }
1113 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1114 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1115 } else {
1116 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1117 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1118 }
1119 }
07be1b83 1120}
3dab1dad
YO
1121
1122#endif
1123
2e64971a 1124
786e8c11
YO
1125/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1126 startbranch: the first branch in the whole branch sequence
1127 first : start branch of sequence of branch-exact nodes.
1128 May be the same as startbranch
1129 last : Thing following the last branch.
1130 May be the same as tail.
1131 tail : item following the branch sequence
1132 count : words in the sequence
1133 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1134 depth : indent depth
3dab1dad 1135
786e8c11 1136Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1137
786e8c11
YO
1138A trie is an N'ary tree where the branches are determined by digital
1139decomposition of the key. IE, at the root node you look up the 1st character and
1140follow that branch repeat until you find the end of the branches. Nodes can be
1141marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1142
786e8c11 1143 /he|she|his|hers/
72f13be8 1144
786e8c11
YO
1145would convert into the following structure. Numbers represent states, letters
1146following numbers represent valid transitions on the letter from that state, if
1147the number is in square brackets it represents an accepting state, otherwise it
1148will be in parenthesis.
07be1b83 1149
786e8c11
YO
1150 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1151 | |
1152 | (2)
1153 | |
1154 (1) +-i->(6)-+-s->[7]
1155 |
1156 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1157
786e8c11
YO
1158 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1159
1160This shows that when matching against the string 'hers' we will begin at state 1
1161read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1162then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1163is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1164single traverse. We store a mapping from accepting to state to which word was
1165matched, and then when we have multiple possibilities we try to complete the
1166rest of the regex in the order in which they occured in the alternation.
1167
1168The only prior NFA like behaviour that would be changed by the TRIE support is
1169the silent ignoring of duplicate alternations which are of the form:
1170
1171 / (DUPE|DUPE) X? (?{ ... }) Y /x
1172
4b714af6 1173Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1174and without the optimisation. With the optimisations dupes will be silently
486ec47a 1175ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1176the following demonstrates:
1177
1178 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1179
1180which prints out 'word' three times, but
1181
1182 'words'=~/(word|word|word)(?{ print $1 })S/
1183
1184which doesnt print it out at all. This is due to other optimisations kicking in.
1185
1186Example of what happens on a structural level:
1187
486ec47a 1188The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1189
1190 1: CURLYM[1] {1,32767}(18)
1191 5: BRANCH(8)
1192 6: EXACT <ac>(16)
1193 8: BRANCH(11)
1194 9: EXACT <ad>(16)
1195 11: BRANCH(14)
1196 12: EXACT <ab>(16)
1197 16: SUCCEED(0)
1198 17: NOTHING(18)
1199 18: END(0)
1200
1201This would be optimizable with startbranch=5, first=5, last=16, tail=16
1202and should turn into:
1203
1204 1: CURLYM[1] {1,32767}(18)
1205 5: TRIE(16)
1206 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1207 <ac>
1208 <ad>
1209 <ab>
1210 16: SUCCEED(0)
1211 17: NOTHING(18)
1212 18: END(0)
1213
1214Cases where tail != last would be like /(?foo|bar)baz/:
1215
1216 1: BRANCH(4)
1217 2: EXACT <foo>(8)
1218 4: BRANCH(7)
1219 5: EXACT <bar>(8)
1220 7: TAIL(8)
1221 8: EXACT <baz>(10)
1222 10: END(0)
1223
1224which would be optimizable with startbranch=1, first=1, last=7, tail=8
1225and would end up looking like:
1226
1227 1: TRIE(8)
1228 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1229 <foo>
1230 <bar>
1231 7: TAIL(8)
1232 8: EXACT <baz>(10)
1233 10: END(0)
1234
1235 d = uvuni_to_utf8_flags(d, uv, 0);
1236
1237is the recommended Unicode-aware way of saying
1238
1239 *(d++) = uv;
1240*/
1241
1e2e3d02 1242#define TRIE_STORE_REVCHAR \
786e8c11 1243 STMT_START { \
73031816
NC
1244 if (UTF) { \
1245 SV *zlopp = newSV(2); \
88c9ea1e
CB
1246 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1247 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1248 SvCUR_set(zlopp, kapow - flrbbbbb); \
1249 SvPOK_on(zlopp); \
1250 SvUTF8_on(zlopp); \
1251 av_push(revcharmap, zlopp); \
1252 } else { \
6bdeddd2 1253 char ooooff = (char)uvc; \
73031816
NC
1254 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1255 } \
1256 } STMT_END
786e8c11
YO
1257
1258#define TRIE_READ_CHAR STMT_START { \
1259 wordlen++; \
1260 if ( UTF ) { \
1261 if ( folder ) { \
1262 if ( foldlen > 0 ) { \
1263 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1264 foldlen -= len; \
1265 scan += len; \
1266 len = 0; \
1267 } else { \
1268 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1269 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1270 foldlen -= UNISKIP( uvc ); \
1271 scan = foldbuf + UNISKIP( uvc ); \
1272 } \
1273 } else { \
1274 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1275 } \
1276 } else { \
1277 uvc = (U32)*uc; \
1278 len = 1; \
1279 } \
1280} STMT_END
1281
1282
1283
1284#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1285 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1286 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1287 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1288 } \
1289 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1290 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1291 TRIE_LIST_CUR( state )++; \
1292} STMT_END
07be1b83 1293
786e8c11
YO
1294#define TRIE_LIST_NEW(state) STMT_START { \
1295 Newxz( trie->states[ state ].trans.list, \
1296 4, reg_trie_trans_le ); \
1297 TRIE_LIST_CUR( state ) = 1; \
1298 TRIE_LIST_LEN( state ) = 4; \
1299} STMT_END
07be1b83 1300
786e8c11
YO
1301#define TRIE_HANDLE_WORD(state) STMT_START { \
1302 U16 dupe= trie->states[ state ].wordnum; \
1303 regnode * const noper_next = regnext( noper ); \
1304 \
786e8c11
YO
1305 DEBUG_r({ \
1306 /* store the word for dumping */ \
1307 SV* tmp; \
1308 if (OP(noper) != NOTHING) \
740cce10 1309 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1310 else \
740cce10 1311 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1312 av_push( trie_words, tmp ); \
786e8c11
YO
1313 }); \
1314 \
1315 curword++; \
2e64971a
DM
1316 trie->wordinfo[curword].prev = 0; \
1317 trie->wordinfo[curword].len = wordlen; \
1318 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1319 \
1320 if ( noper_next < tail ) { \
1321 if (!trie->jump) \
c944940b 1322 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1323 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1324 if (!jumper) \
1325 jumper = noper_next; \
1326 if (!nextbranch) \
1327 nextbranch= regnext(cur); \
1328 } \
1329 \
1330 if ( dupe ) { \
2e64971a
DM
1331 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1332 /* chain, so that when the bits of chain are later */\
1333 /* linked together, the dups appear in the chain */\
1334 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1335 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1336 } else { \
1337 /* we haven't inserted this word yet. */ \
1338 trie->states[ state ].wordnum = curword; \
1339 } \
1340} STMT_END
07be1b83 1341
3dab1dad 1342
786e8c11
YO
1343#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1344 ( ( base + charid >= ucharcount \
1345 && base + charid < ubound \
1346 && state == trie->trans[ base - ucharcount + charid ].check \
1347 && trie->trans[ base - ucharcount + charid ].next ) \
1348 ? trie->trans[ base - ucharcount + charid ].next \
1349 : ( state==1 ? special : 0 ) \
1350 )
3dab1dad 1351
786e8c11
YO
1352#define MADE_TRIE 1
1353#define MADE_JUMP_TRIE 2
1354#define MADE_EXACT_TRIE 4
3dab1dad 1355
a3621e74 1356STATIC I32
786e8c11 1357S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1358{
27da23d5 1359 dVAR;
a3621e74
YO
1360 /* first pass, loop through and scan words */
1361 reg_trie_data *trie;
55eed653 1362 HV *widecharmap = NULL;
2b8b4781 1363 AV *revcharmap = newAV();
a3621e74 1364 regnode *cur;
9f7f3913 1365 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1366 STRLEN len = 0;
1367 UV uvc = 0;
1368 U16 curword = 0;
1369 U32 next_alloc = 0;
786e8c11
YO
1370 regnode *jumper = NULL;
1371 regnode *nextbranch = NULL;
7f69552c 1372 regnode *convert = NULL;
2e64971a 1373 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1374 /* we just use folder as a flag in utf8 */
1e696034 1375 const U8 * folder = NULL;
a3621e74 1376
2b8b4781
NC
1377#ifdef DEBUGGING
1378 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1379 AV *trie_words = NULL;
1380 /* along with revcharmap, this only used during construction but both are
1381 * useful during debugging so we store them in the struct when debugging.
8e11feef 1382 */
2b8b4781
NC
1383#else
1384 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1385 STRLEN trie_charcount=0;
3dab1dad 1386#endif
2b8b4781 1387 SV *re_trie_maxbuff;
a3621e74 1388 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1389
1390 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1391#ifndef DEBUGGING
1392 PERL_UNUSED_ARG(depth);
1393#endif
a3621e74 1394
1e696034
KW
1395 switch (flags) {
1396 case EXACTFU: folder = PL_fold_latin1; break;
1397 case EXACTF: folder = PL_fold; break;
1398 case EXACTFL: folder = PL_fold_locale; break;
1399 }
1400
c944940b 1401 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1402 trie->refcount = 1;
3dab1dad 1403 trie->startstate = 1;
786e8c11 1404 trie->wordcount = word_count;
f8fc2ecf 1405 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1406 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1407 if (!(UTF && folder))
c944940b 1408 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1409 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1410 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1411
a3621e74 1412 DEBUG_r({
2b8b4781 1413 trie_words = newAV();
a3621e74 1414 });
a3621e74 1415
0111c4fd 1416 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1417 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1418 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1419 }
3dab1dad
YO
1420 DEBUG_OPTIMISE_r({
1421 PerlIO_printf( Perl_debug_log,
786e8c11 1422 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1423 (int)depth * 2 + 2, "",
1424 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1425 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1426 (int)depth);
3dab1dad 1427 });
7f69552c
YO
1428
1429 /* Find the node we are going to overwrite */
1430 if ( first == startbranch && OP( last ) != BRANCH ) {
1431 /* whole branch chain */
1432 convert = first;
1433 } else {
1434 /* branch sub-chain */
1435 convert = NEXTOPER( first );
1436 }
1437
a3621e74
YO
1438 /* -- First loop and Setup --
1439
1440 We first traverse the branches and scan each word to determine if it
1441 contains widechars, and how many unique chars there are, this is
1442 important as we have to build a table with at least as many columns as we
1443 have unique chars.
1444
1445 We use an array of integers to represent the character codes 0..255
38a44b82 1446 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1447 native representation of the character value as the key and IV's for the
1448 coded index.
1449
1450 *TODO* If we keep track of how many times each character is used we can
1451 remap the columns so that the table compression later on is more
3b753521 1452 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1453 middle and the least common are on the outside. IMO this would be better
1454 than a most to least common mapping as theres a decent chance the most
1455 common letter will share a node with the least common, meaning the node
486ec47a 1456 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1457 case is when we have the least common nodes twice.
1458
1459 */
1460
a3621e74 1461 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1462 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1463 const U8 *uc = (U8*)STRING( noper );
a28509cc 1464 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1465 STRLEN foldlen = 0;
1466 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1467 const U8 *scan = (U8*)NULL;
07be1b83 1468 U32 wordlen = 0; /* required init */
02daf0ab
YO
1469 STRLEN chars = 0;
1470 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1471
3dab1dad
YO
1472 if (OP(noper) == NOTHING) {
1473 trie->minlen= 0;
1474 continue;
1475 }
02daf0ab
YO
1476 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1477 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1478 regardless of encoding */
1479
a3621e74 1480 for ( ; uc < e ; uc += len ) {
3dab1dad 1481 TRIE_CHARCOUNT(trie)++;
a3621e74 1482 TRIE_READ_CHAR;
3dab1dad 1483 chars++;
a3621e74
YO
1484 if ( uvc < 256 ) {
1485 if ( !trie->charmap[ uvc ] ) {
1486 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1487 if ( folder )
1488 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1489 TRIE_STORE_REVCHAR;
a3621e74 1490 }
02daf0ab 1491 if ( set_bit ) {
62012aee
KW
1492 /* store the codepoint in the bitmap, and its folded
1493 * equivalent. */
02daf0ab 1494 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1495
1496 /* store the folded codepoint */
1497 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1498
1499 if ( !UTF ) {
1500 /* store first byte of utf8 representation of
acdf4139
KW
1501 variant codepoints */
1502 if (! UNI_IS_INVARIANT(uvc)) {
1503 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1504 }
1505 }
02daf0ab
YO
1506 set_bit = 0; /* We've done our bit :-) */
1507 }
a3621e74
YO
1508 } else {
1509 SV** svpp;
55eed653
NC
1510 if ( !widecharmap )
1511 widecharmap = newHV();
a3621e74 1512
55eed653 1513 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1514
1515 if ( !svpp )
e4584336 1516 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1517
1518 if ( !SvTRUE( *svpp ) ) {
1519 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1520 TRIE_STORE_REVCHAR;
a3621e74
YO
1521 }
1522 }
1523 }
3dab1dad
YO
1524 if( cur == first ) {
1525 trie->minlen=chars;
1526 trie->maxlen=chars;
1527 } else if (chars < trie->minlen) {
1528 trie->minlen=chars;
1529 } else if (chars > trie->maxlen) {
1530 trie->maxlen=chars;
1531 }
1532
a3621e74
YO
1533 } /* end first pass */
1534 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1535 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1536 (int)depth * 2 + 2,"",
55eed653 1537 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1538 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1539 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1540 );
a3621e74
YO
1541
1542 /*
1543 We now know what we are dealing with in terms of unique chars and
1544 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1545 representation using a flat table will take. If it's over a reasonable
1546 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1547 conservative but potentially much slower representation using an array
1548 of lists.
1549
1550 At the end we convert both representations into the same compressed
1551 form that will be used in regexec.c for matching with. The latter
1552 is a form that cannot be used to construct with but has memory
1553 properties similar to the list form and access properties similar
1554 to the table form making it both suitable for fast searches and
1555 small enough that its feasable to store for the duration of a program.
1556
1557 See the comment in the code where the compressed table is produced
1558 inplace from the flat tabe representation for an explanation of how
1559 the compression works.
1560
1561 */
1562
1563
2e64971a
DM
1564 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1565 prev_states[1] = 0;
1566
3dab1dad 1567 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1568 /*
1569 Second Pass -- Array Of Lists Representation
1570
1571 Each state will be represented by a list of charid:state records
1572 (reg_trie_trans_le) the first such element holds the CUR and LEN
1573 points of the allocated array. (See defines above).
1574
1575 We build the initial structure using the lists, and then convert
1576 it into the compressed table form which allows faster lookups
1577 (but cant be modified once converted).
a3621e74
YO
1578 */
1579
a3621e74
YO
1580 STRLEN transcount = 1;
1581
1e2e3d02
YO
1582 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1583 "%*sCompiling trie using list compiler\n",
1584 (int)depth * 2 + 2, ""));
446bd890 1585
c944940b
JH
1586 trie->states = (reg_trie_state *)
1587 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1588 sizeof(reg_trie_state) );
a3621e74
YO
1589 TRIE_LIST_NEW(1);
1590 next_alloc = 2;
1591
1592 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1593
c445ea15
AL
1594 regnode * const noper = NEXTOPER( cur );
1595 U8 *uc = (U8*)STRING( noper );
1596 const U8 * const e = uc + STR_LEN( noper );
1597 U32 state = 1; /* required init */
1598 U16 charid = 0; /* sanity init */
1599 U8 *scan = (U8*)NULL; /* sanity init */
1600 STRLEN foldlen = 0; /* required init */
07be1b83 1601 U32 wordlen = 0; /* required init */
c445ea15
AL
1602 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1603
3dab1dad 1604 if (OP(noper) != NOTHING) {
786e8c11 1605 for ( ; uc < e ; uc += len ) {
c445ea15 1606
786e8c11 1607 TRIE_READ_CHAR;
c445ea15 1608
786e8c11
YO
1609 if ( uvc < 256 ) {
1610 charid = trie->charmap[ uvc ];
c445ea15 1611 } else {
55eed653 1612 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1613 if ( !svpp ) {
1614 charid = 0;
1615 } else {
1616 charid=(U16)SvIV( *svpp );
1617 }
c445ea15 1618 }
786e8c11
YO
1619 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1620 if ( charid ) {
a3621e74 1621
786e8c11
YO
1622 U16 check;
1623 U32 newstate = 0;
a3621e74 1624
786e8c11
YO
1625 charid--;
1626 if ( !trie->states[ state ].trans.list ) {
1627 TRIE_LIST_NEW( state );
c445ea15 1628 }
786e8c11
YO
1629 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1630 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1631 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1632 break;
1633 }
1634 }
1635 if ( ! newstate ) {
1636 newstate = next_alloc++;
2e64971a 1637 prev_states[newstate] = state;
786e8c11
YO
1638 TRIE_LIST_PUSH( state, charid, newstate );
1639 transcount++;
1640 }
1641 state = newstate;
1642 } else {
1643 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1644 }
a28509cc 1645 }
c445ea15 1646 }
3dab1dad 1647 TRIE_HANDLE_WORD(state);
a3621e74
YO
1648
1649 } /* end second pass */
1650
1e2e3d02
YO
1651 /* next alloc is the NEXT state to be allocated */
1652 trie->statecount = next_alloc;
c944940b
JH
1653 trie->states = (reg_trie_state *)
1654 PerlMemShared_realloc( trie->states,
1655 next_alloc
1656 * sizeof(reg_trie_state) );
a3621e74 1657
3dab1dad 1658 /* and now dump it out before we compress it */
2b8b4781
NC
1659 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1660 revcharmap, next_alloc,
1661 depth+1)
1e2e3d02 1662 );
a3621e74 1663
c944940b
JH
1664 trie->trans = (reg_trie_trans *)
1665 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1666 {
1667 U32 state;
a3621e74
YO
1668 U32 tp = 0;
1669 U32 zp = 0;
1670
1671
1672 for( state=1 ; state < next_alloc ; state ++ ) {
1673 U32 base=0;
1674
1675 /*
1676 DEBUG_TRIE_COMPILE_MORE_r(
1677 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1678 );
1679 */
1680
1681 if (trie->states[state].trans.list) {
1682 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1683 U16 maxid=minid;
a28509cc 1684 U16 idx;
a3621e74
YO
1685
1686 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1687 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1688 if ( forid < minid ) {
1689 minid=forid;
1690 } else if ( forid > maxid ) {
1691 maxid=forid;
1692 }
a3621e74
YO
1693 }
1694 if ( transcount < tp + maxid - minid + 1) {
1695 transcount *= 2;
c944940b
JH
1696 trie->trans = (reg_trie_trans *)
1697 PerlMemShared_realloc( trie->trans,
446bd890
NC
1698 transcount
1699 * sizeof(reg_trie_trans) );
a3621e74
YO
1700 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1701 }
1702 base = trie->uniquecharcount + tp - minid;
1703 if ( maxid == minid ) {
1704 U32 set = 0;
1705 for ( ; zp < tp ; zp++ ) {
1706 if ( ! trie->trans[ zp ].next ) {
1707 base = trie->uniquecharcount + zp - minid;
1708 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1709 trie->trans[ zp ].check = state;
1710 set = 1;
1711 break;
1712 }
1713 }
1714 if ( !set ) {
1715 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1716 trie->trans[ tp ].check = state;
1717 tp++;
1718 zp = tp;
1719 }
1720 } else {
1721 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1722 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1723 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1724 trie->trans[ tid ].check = state;
1725 }
1726 tp += ( maxid - minid + 1 );
1727 }
1728 Safefree(trie->states[ state ].trans.list);
1729 }
1730 /*
1731 DEBUG_TRIE_COMPILE_MORE_r(
1732 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1733 );
1734 */
1735 trie->states[ state ].trans.base=base;
1736 }
cc601c31 1737 trie->lasttrans = tp + 1;
a3621e74
YO
1738 }
1739 } else {
1740 /*
1741 Second Pass -- Flat Table Representation.
1742
1743 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1744 We know that we will need Charcount+1 trans at most to store the data
1745 (one row per char at worst case) So we preallocate both structures
1746 assuming worst case.
1747
1748 We then construct the trie using only the .next slots of the entry
1749 structs.
1750
3b753521 1751 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1752 make compression both faster and easier by keeping track of how many non
1753 zero fields are in the node.
1754
1755 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1756 transition.
1757
1758 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1759 number representing the first entry of the node, and state as a
1760 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1761 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1762 are 2 entrys per node. eg:
1763
1764 A B A B
1765 1. 2 4 1. 3 7
1766 2. 0 3 3. 0 5
1767 3. 0 0 5. 0 0
1768 4. 0 0 7. 0 0
1769
1770 The table is internally in the right hand, idx form. However as we also
1771 have to deal with the states array which is indexed by nodenum we have to
1772 use TRIE_NODENUM() to convert.
1773
1774 */
1e2e3d02
YO
1775 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1776 "%*sCompiling trie using table compiler\n",
1777 (int)depth * 2 + 2, ""));
3dab1dad 1778
c944940b
JH
1779 trie->trans = (reg_trie_trans *)
1780 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1781 * trie->uniquecharcount + 1,
1782 sizeof(reg_trie_trans) );
1783 trie->states = (reg_trie_state *)
1784 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1785 sizeof(reg_trie_state) );
a3621e74
YO
1786 next_alloc = trie->uniquecharcount + 1;
1787
3dab1dad 1788
a3621e74
YO
1789 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1790
c445ea15 1791 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1792 const U8 *uc = (U8*)STRING( noper );
1793 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1794
1795 U32 state = 1; /* required init */
1796
1797 U16 charid = 0; /* sanity init */
1798 U32 accept_state = 0; /* sanity init */
1799 U8 *scan = (U8*)NULL; /* sanity init */
1800
1801 STRLEN foldlen = 0; /* required init */
07be1b83 1802 U32 wordlen = 0; /* required init */
a3621e74
YO
1803 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1804
3dab1dad 1805 if ( OP(noper) != NOTHING ) {
786e8c11 1806 for ( ; uc < e ; uc += len ) {
a3621e74 1807
786e8c11 1808 TRIE_READ_CHAR;
a3621e74 1809
786e8c11
YO
1810 if ( uvc < 256 ) {
1811 charid = trie->charmap[ uvc ];
1812 } else {
55eed653 1813 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1814 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1815 }
786e8c11
YO
1816 if ( charid ) {
1817 charid--;
1818 if ( !trie->trans[ state + charid ].next ) {
1819 trie->trans[ state + charid ].next = next_alloc;
1820 trie->trans[ state ].check++;
2e64971a
DM
1821 prev_states[TRIE_NODENUM(next_alloc)]
1822 = TRIE_NODENUM(state);
786e8c11
YO
1823 next_alloc += trie->uniquecharcount;
1824 }
1825 state = trie->trans[ state + charid ].next;
1826 } else {
1827 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1828 }
1829 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1830 }
a3621e74 1831 }
3dab1dad
YO
1832 accept_state = TRIE_NODENUM( state );
1833 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1834
1835 } /* end second pass */
1836
3dab1dad 1837 /* and now dump it out before we compress it */
2b8b4781
NC
1838 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1839 revcharmap,
1840 next_alloc, depth+1));
a3621e74 1841
a3621e74
YO
1842 {
1843 /*
1844 * Inplace compress the table.*
1845
1846 For sparse data sets the table constructed by the trie algorithm will
1847 be mostly 0/FAIL transitions or to put it another way mostly empty.
1848 (Note that leaf nodes will not contain any transitions.)
1849
1850 This algorithm compresses the tables by eliminating most such
1851 transitions, at the cost of a modest bit of extra work during lookup:
1852
1853 - Each states[] entry contains a .base field which indicates the
1854 index in the state[] array wheres its transition data is stored.
1855
3b753521 1856 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1857
1858 - If .base is nonzero then charid is added to it to find an entry in
1859 the trans array.
1860
1861 -If trans[states[state].base+charid].check!=state then the
1862 transition is taken to be a 0/Fail transition. Thus if there are fail
1863 transitions at the front of the node then the .base offset will point
1864 somewhere inside the previous nodes data (or maybe even into a node
1865 even earlier), but the .check field determines if the transition is
1866 valid.
1867
786e8c11 1868 XXX - wrong maybe?
a3621e74 1869 The following process inplace converts the table to the compressed
3b753521 1870 table: We first do not compress the root node 1,and mark all its
a3621e74 1871 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1872 allows us to do a DFA construction from the compressed table later,
1873 and ensures that any .base pointers we calculate later are greater
1874 than 0.
a3621e74
YO
1875
1876 - We set 'pos' to indicate the first entry of the second node.
1877
1878 - We then iterate over the columns of the node, finding the first and
1879 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1880 and set the .check pointers accordingly, and advance pos
1881 appropriately and repreat for the next node. Note that when we copy
1882 the next pointers we have to convert them from the original
1883 NODEIDX form to NODENUM form as the former is not valid post
1884 compression.
1885
1886 - If a node has no transitions used we mark its base as 0 and do not
1887 advance the pos pointer.
1888
1889 - If a node only has one transition we use a second pointer into the
1890 structure to fill in allocated fail transitions from other states.
1891 This pointer is independent of the main pointer and scans forward
1892 looking for null transitions that are allocated to a state. When it
1893 finds one it writes the single transition into the "hole". If the
786e8c11 1894 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1895
1896 - Once compressed we can Renew/realloc the structures to release the
1897 excess space.
1898
1899 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1900 specifically Fig 3.47 and the associated pseudocode.
1901
1902 demq
1903 */
a3b680e6 1904 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1905 U32 state, charid;
a3621e74 1906 U32 pos = 0, zp=0;
1e2e3d02 1907 trie->statecount = laststate;
a3621e74
YO
1908
1909 for ( state = 1 ; state < laststate ; state++ ) {
1910 U8 flag = 0;
a28509cc
AL
1911 const U32 stateidx = TRIE_NODEIDX( state );
1912 const U32 o_used = trie->trans[ stateidx ].check;
1913 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1914 trie->trans[ stateidx ].check = 0;
1915
1916 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1917 if ( flag || trie->trans[ stateidx + charid ].next ) {
1918 if ( trie->trans[ stateidx + charid ].next ) {
1919 if (o_used == 1) {
1920 for ( ; zp < pos ; zp++ ) {
1921 if ( ! trie->trans[ zp ].next ) {
1922 break;
1923 }
1924 }
1925 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1926 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1927 trie->trans[ zp ].check = state;
1928 if ( ++zp > pos ) pos = zp;
1929 break;
1930 }
1931 used--;
1932 }
1933 if ( !flag ) {
1934 flag = 1;
1935 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1936 }
1937 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1938 trie->trans[ pos ].check = state;
1939 pos++;
1940 }
1941 }
1942 }
cc601c31 1943 trie->lasttrans = pos + 1;
c944940b
JH
1944 trie->states = (reg_trie_state *)
1945 PerlMemShared_realloc( trie->states, laststate
1946 * sizeof(reg_trie_state) );
a3621e74 1947 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1948 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1949 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1950 (int)depth * 2 + 2,"",
1951 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1952 (IV)next_alloc,
1953 (IV)pos,
a3621e74
YO
1954 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1955 );
1956
1957 } /* end table compress */
1958 }
1e2e3d02
YO
1959 DEBUG_TRIE_COMPILE_MORE_r(
1960 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1961 (int)depth * 2 + 2, "",
1962 (UV)trie->statecount,
1963 (UV)trie->lasttrans)
1964 );
cc601c31 1965 /* resize the trans array to remove unused space */
c944940b
JH
1966 trie->trans = (reg_trie_trans *)
1967 PerlMemShared_realloc( trie->trans, trie->lasttrans
1968 * sizeof(reg_trie_trans) );
a3621e74 1969
3b753521 1970 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1971 U8 nodetype =(U8)(flags & 0xFF);
1972 char *str=NULL;
786e8c11 1973
07be1b83 1974#ifdef DEBUGGING
e62cc96a 1975 regnode *optimize = NULL;
7122b237
YO
1976#ifdef RE_TRACK_PATTERN_OFFSETS
1977
b57a0404
JH
1978 U32 mjd_offset = 0;
1979 U32 mjd_nodelen = 0;
7122b237
YO
1980#endif /* RE_TRACK_PATTERN_OFFSETS */
1981#endif /* DEBUGGING */
a3621e74 1982 /*
3dab1dad
YO
1983 This means we convert either the first branch or the first Exact,
1984 depending on whether the thing following (in 'last') is a branch
1985 or not and whther first is the startbranch (ie is it a sub part of
1986 the alternation or is it the whole thing.)
3b753521 1987 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 1988 the whole branch sequence, including the first.
a3621e74 1989 */
3dab1dad 1990 /* Find the node we are going to overwrite */
7f69552c 1991 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1992 /* branch sub-chain */
3dab1dad 1993 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1994#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1995 DEBUG_r({
1996 mjd_offset= Node_Offset((convert));
1997 mjd_nodelen= Node_Length((convert));
1998 });
7122b237 1999#endif
7f69552c 2000 /* whole branch chain */
7122b237
YO
2001 }
2002#ifdef RE_TRACK_PATTERN_OFFSETS
2003 else {
7f69552c
YO
2004 DEBUG_r({
2005 const regnode *nop = NEXTOPER( convert );
2006 mjd_offset= Node_Offset((nop));
2007 mjd_nodelen= Node_Length((nop));
2008 });
07be1b83
YO
2009 }
2010 DEBUG_OPTIMISE_r(
2011 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2012 (int)depth * 2 + 2, "",
786e8c11 2013 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2014 );
7122b237 2015#endif
3dab1dad
YO
2016 /* But first we check to see if there is a common prefix we can
2017 split out as an EXACT and put in front of the TRIE node. */
2018 trie->startstate= 1;
55eed653 2019 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2020 U32 state;
1e2e3d02 2021 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2022 U32 ofs = 0;
8e11feef
RGS
2023 I32 idx = -1;
2024 U32 count = 0;
2025 const U32 base = trie->states[ state ].trans.base;
a3621e74 2026
3dab1dad 2027 if ( trie->states[state].wordnum )
8e11feef 2028 count = 1;
a3621e74 2029
8e11feef 2030 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2031 if ( ( base + ofs >= trie->uniquecharcount ) &&
2032 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2033 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2034 {
3dab1dad 2035 if ( ++count > 1 ) {
2b8b4781 2036 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2037 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2038 if ( state == 1 ) break;
3dab1dad
YO
2039 if ( count == 2 ) {
2040 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2041 DEBUG_OPTIMISE_r(
8e11feef
RGS
2042 PerlIO_printf(Perl_debug_log,
2043 "%*sNew Start State=%"UVuf" Class: [",
2044 (int)depth * 2 + 2, "",
786e8c11 2045 (UV)state));
be8e71aa 2046 if (idx >= 0) {
2b8b4781 2047 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2048 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2049
3dab1dad 2050 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2051 if ( folder )
2052 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2053 DEBUG_OPTIMISE_r(
f1f66076 2054 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2055 );
8e11feef
RGS
2056 }
2057 }
2058 TRIE_BITMAP_SET(trie,*ch);
2059 if ( folder )
2060 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2061 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2062 }
2063 idx = ofs;
2064 }
3dab1dad
YO
2065 }
2066 if ( count == 1 ) {
2b8b4781 2067 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2068 STRLEN len;
2069 char *ch = SvPV( *tmp, len );
de734bd5
A
2070 DEBUG_OPTIMISE_r({
2071 SV *sv=sv_newmortal();
8e11feef
RGS
2072 PerlIO_printf( Perl_debug_log,
2073 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2074 (int)depth * 2 + 2, "",
de734bd5
A
2075 (UV)state, (UV)idx,
2076 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2077 PL_colors[0], PL_colors[1],
2078 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2079 PERL_PV_ESCAPE_FIRSTCHAR
2080 )
2081 );
2082 });
3dab1dad
YO
2083 if ( state==1 ) {
2084 OP( convert ) = nodetype;
2085 str=STRING(convert);
2086 STR_LEN(convert)=0;
2087 }
c490c714
YO
2088 STR_LEN(convert) += len;
2089 while (len--)
de734bd5 2090 *str++ = *ch++;
8e11feef 2091 } else {
f9049ba1 2092#ifdef DEBUGGING
8e11feef
RGS
2093 if (state>1)
2094 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2095#endif
8e11feef
RGS
2096 break;
2097 }
2098 }
2e64971a 2099 trie->prefixlen = (state-1);
3dab1dad 2100 if (str) {
8e11feef 2101 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2102 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2103 trie->startstate = state;
07be1b83
YO
2104 trie->minlen -= (state - 1);
2105 trie->maxlen -= (state - 1);
33809eae
JH
2106#ifdef DEBUGGING
2107 /* At least the UNICOS C compiler choked on this
2108 * being argument to DEBUG_r(), so let's just have
2109 * it right here. */
2110 if (
2111#ifdef PERL_EXT_RE_BUILD
2112 1
2113#else
2114 DEBUG_r_TEST
2115#endif
2116 ) {
2117 regnode *fix = convert;
2118 U32 word = trie->wordcount;
2119 mjd_nodelen++;
2120 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2121 while( ++fix < n ) {
2122 Set_Node_Offset_Length(fix, 0, 0);
2123 }
2124 while (word--) {
2125 SV ** const tmp = av_fetch( trie_words, word, 0 );
2126 if (tmp) {
2127 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2128 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2129 else
2130 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2131 }
2132 }
2133 }
2134#endif
8e11feef
RGS
2135 if (trie->maxlen) {
2136 convert = n;
2137 } else {
3dab1dad 2138 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2139 DEBUG_r(optimize= n);
3dab1dad
YO
2140 }
2141 }
2142 }
a5ca303d
YO
2143 if (!jumper)
2144 jumper = last;
3dab1dad 2145 if ( trie->maxlen ) {
8e11feef
RGS
2146 NEXT_OFF( convert ) = (U16)(tail - convert);
2147 ARG_SET( convert, data_slot );
786e8c11
YO
2148 /* Store the offset to the first unabsorbed branch in
2149 jump[0], which is otherwise unused by the jump logic.
2150 We use this when dumping a trie and during optimisation. */
2151 if (trie->jump)
7f69552c 2152 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2153
6c48061a
YO
2154 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2155 * and there is a bitmap
2156 * and the first "jump target" node we found leaves enough room
2157 * then convert the TRIE node into a TRIEC node, with the bitmap
2158 * embedded inline in the opcode - this is hypothetically faster.
2159 */
2160 if ( !trie->states[trie->startstate].wordnum
2161 && trie->bitmap
2162 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2163 {
2164 OP( convert ) = TRIEC;
2165 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2166 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2167 trie->bitmap= NULL;
2168 } else
2169 OP( convert ) = TRIE;
a3621e74 2170
3dab1dad
YO
2171 /* store the type in the flags */
2172 convert->flags = nodetype;
a5ca303d
YO
2173 DEBUG_r({
2174 optimize = convert
2175 + NODE_STEP_REGNODE
2176 + regarglen[ OP( convert ) ];
2177 });
2178 /* XXX We really should free up the resource in trie now,
2179 as we won't use them - (which resources?) dmq */
3dab1dad 2180 }
a3621e74 2181 /* needed for dumping*/
e62cc96a 2182 DEBUG_r(if (optimize) {
07be1b83 2183 regnode *opt = convert;
bcdf7404 2184
e62cc96a 2185 while ( ++opt < optimize) {
07be1b83
YO
2186 Set_Node_Offset_Length(opt,0,0);
2187 }
786e8c11
YO
2188 /*
2189 Try to clean up some of the debris left after the
2190 optimisation.
a3621e74 2191 */
786e8c11 2192 while( optimize < jumper ) {
07be1b83 2193 mjd_nodelen += Node_Length((optimize));
a3621e74 2194 OP( optimize ) = OPTIMIZED;
07be1b83 2195 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2196 optimize++;
2197 }
07be1b83 2198 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2199 });
2200 } /* end node insert */
2e64971a
DM
2201
2202 /* Finish populating the prev field of the wordinfo array. Walk back
2203 * from each accept state until we find another accept state, and if
2204 * so, point the first word's .prev field at the second word. If the
2205 * second already has a .prev field set, stop now. This will be the
2206 * case either if we've already processed that word's accept state,
3b753521
FN
2207 * or that state had multiple words, and the overspill words were
2208 * already linked up earlier.
2e64971a
DM
2209 */
2210 {
2211 U16 word;
2212 U32 state;
2213 U16 prev;
2214
2215 for (word=1; word <= trie->wordcount; word++) {
2216 prev = 0;
2217 if (trie->wordinfo[word].prev)
2218 continue;
2219 state = trie->wordinfo[word].accept;
2220 while (state) {
2221 state = prev_states[state];
2222 if (!state)
2223 break;
2224 prev = trie->states[state].wordnum;
2225 if (prev)
2226 break;
2227 }
2228 trie->wordinfo[word].prev = prev;
2229 }
2230 Safefree(prev_states);
2231 }
2232
2233
2234 /* and now dump out the compressed format */
2235 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2236
55eed653 2237 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2238#ifdef DEBUGGING
2239 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2240 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2241#else
2242 SvREFCNT_dec(revcharmap);
07be1b83 2243#endif
786e8c11
YO
2244 return trie->jump
2245 ? MADE_JUMP_TRIE
2246 : trie->startstate>1
2247 ? MADE_EXACT_TRIE
2248 : MADE_TRIE;
2249}
2250
2251STATIC void
2252S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2253{
3b753521 2254/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2255
2256 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2257 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2258 ISBN 0-201-10088-6
2259
2260 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2261 suffix of the current state's 'word' that is also a proper prefix of another word in our
2262 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2263 the DFA not to have to restart after its tried and failed a word at a given point, it
2264 simply continues as though it had been matching the other word in the first place.
2265 Consider
2266 'abcdgu'=~/abcdefg|cdgu/
2267 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2268 fail, which would bring us to the state representing 'd' in the second word where we would
2269 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2270 */
2271 /* add a fail transition */
3251b653
NC
2272 const U32 trie_offset = ARG(source);
2273 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2274 U32 *q;
2275 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2276 const U32 numstates = trie->statecount;
786e8c11
YO
2277 const U32 ubound = trie->lasttrans + ucharcount;
2278 U32 q_read = 0;
2279 U32 q_write = 0;
2280 U32 charid;
2281 U32 base = trie->states[ 1 ].trans.base;
2282 U32 *fail;
2283 reg_ac_data *aho;
2284 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2285 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2286
2287 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2288#ifndef DEBUGGING
2289 PERL_UNUSED_ARG(depth);
2290#endif
2291
2292
2293 ARG_SET( stclass, data_slot );
c944940b 2294 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2295 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2296 aho->trie=trie_offset;
446bd890
NC
2297 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2298 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2299 Newxz( q, numstates, U32);
c944940b 2300 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2301 aho->refcount = 1;
2302 fail = aho->fail;
2303 /* initialize fail[0..1] to be 1 so that we always have
2304 a valid final fail state */
2305 fail[ 0 ] = fail[ 1 ] = 1;
2306
2307 for ( charid = 0; charid < ucharcount ; charid++ ) {
2308 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2309 if ( newstate ) {
2310 q[ q_write ] = newstate;
2311 /* set to point at the root */
2312 fail[ q[ q_write++ ] ]=1;
2313 }
2314 }
2315 while ( q_read < q_write) {
2316 const U32 cur = q[ q_read++ % numstates ];
2317 base = trie->states[ cur ].trans.base;
2318
2319 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2320 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2321 if (ch_state) {
2322 U32 fail_state = cur;
2323 U32 fail_base;
2324 do {
2325 fail_state = fail[ fail_state ];
2326 fail_base = aho->states[ fail_state ].trans.base;
2327 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2328
2329 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2330 fail[ ch_state ] = fail_state;
2331 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2332 {
2333 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2334 }
2335 q[ q_write++ % numstates] = ch_state;
2336 }
2337 }
2338 }
2339 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2340 when we fail in state 1, this allows us to use the
2341 charclass scan to find a valid start char. This is based on the principle
2342 that theres a good chance the string being searched contains lots of stuff
2343 that cant be a start char.
2344 */
2345 fail[ 0 ] = fail[ 1 ] = 0;
2346 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2347 PerlIO_printf(Perl_debug_log,
2348 "%*sStclass Failtable (%"UVuf" states): 0",
2349 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2350 );
786e8c11
YO
2351 for( q_read=1; q_read<numstates; q_read++ ) {
2352 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2353 }
2354 PerlIO_printf(Perl_debug_log, "\n");
2355 });
2356 Safefree(q);
2357 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2358}
2359
786e8c11 2360
a3621e74 2361/*
5d1c421c
JH
2362 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2363 * These need to be revisited when a newer toolchain becomes available.
2364 */
2365#if defined(__sparc64__) && defined(__GNUC__)
2366# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2367# undef SPARC64_GCC_WORKAROUND
2368# define SPARC64_GCC_WORKAROUND 1
2369# endif
2370#endif
2371
07be1b83 2372#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2373 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2374 SV * const mysv=sv_newmortal(); \
2375 regnode *Next = regnext(scan); \
2376 regprop(RExC_rx, mysv, scan); \
7f69552c 2377 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2378 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2379 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2380 }});
07be1b83 2381
1de06328
YO
2382
2383
2384
2385
07be1b83
YO
2386#define JOIN_EXACT(scan,min,flags) \
2387 if (PL_regkind[OP(scan)] == EXACT) \
2388 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2389
be8e71aa 2390STATIC U32
07be1b83
YO
2391S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2392 /* Merge several consecutive EXACTish nodes into one. */
2393 regnode *n = regnext(scan);
2394 U32 stringok = 1;
2395 regnode *next = scan + NODE_SZ_STR(scan);
2396 U32 merged = 0;
2397 U32 stopnow = 0;
2398#ifdef DEBUGGING
2399 regnode *stop = scan;
72f13be8 2400 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2401#else
d47053eb
RGS
2402 PERL_UNUSED_ARG(depth);
2403#endif
7918f24d
NC
2404
2405 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2406#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2407 PERL_UNUSED_ARG(flags);
2408 PERL_UNUSED_ARG(val);
07be1b83 2409#endif
07be1b83
YO
2410 DEBUG_PEEP("join",scan,depth);
2411
2412 /* Skip NOTHING, merge EXACT*. */
2413 while (n &&
2414 ( PL_regkind[OP(n)] == NOTHING ||
2415 (stringok && (OP(n) == OP(scan))))
2416 && NEXT_OFF(n)
2417 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2418
2419 if (OP(n) == TAIL || n > next)
2420 stringok = 0;
2421 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2422 DEBUG_PEEP("skip:",n,depth);
2423 NEXT_OFF(scan) += NEXT_OFF(n);
2424 next = n + NODE_STEP_REGNODE;
2425#ifdef DEBUGGING
2426 if (stringok)
2427 stop = n;
2428#endif
2429 n = regnext(n);
2430 }
2431 else if (stringok) {
786e8c11 2432 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2433 regnode * const nnext = regnext(n);
2434
2435 DEBUG_PEEP("merg",n,depth);
2436
2437 merged++;
2438 if (oldl + STR_LEN(n) > U8_MAX)
2439 break;
2440 NEXT_OFF(scan) += NEXT_OFF(n);
2441 STR_LEN(scan) += STR_LEN(n);
2442 next = n + NODE_SZ_STR(n);
2443 /* Now we can overwrite *n : */
2444 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2445#ifdef DEBUGGING
2446 stop = next - 1;
2447#endif
2448 n = nnext;
2449 if (stopnow) break;
2450 }
2451
d47053eb
RGS
2452#ifdef EXPERIMENTAL_INPLACESCAN
2453 if (flags && !NEXT_OFF(n)) {
2454 DEBUG_PEEP("atch", val, depth);
2455 if (reg_off_by_arg[OP(n)]) {
2456 ARG_SET(n, val - n);
2457 }
2458 else {
2459 NEXT_OFF(n) = val - n;
2460 }
2461 stopnow = 1;
2462 }
07be1b83
YO
2463#endif
2464 }
ced7f090
KW
2465#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2466#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2467#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2468#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2469
2470 if (UTF
2471 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2472 && ( STR_LEN(scan) >= 6 ) )
2473 {
07be1b83
YO
2474 /*
2475 Two problematic code points in Unicode casefolding of EXACT nodes:
2476
2477 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2478 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2479
2480 which casefold to
2481
2482 Unicode UTF-8
2483
2484 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2485 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2486
2487 This means that in case-insensitive matching (or "loose matching",
2488 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2489 length of the above casefolded versions) can match a target string
2490 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2491 This would rather mess up the minimum length computation.
2492
2493 What we'll do is to look for the tail four bytes, and then peek
2494 at the preceding two bytes to see whether we need to decrease
2495 the minimum length by four (six minus two).
2496
2497 Thanks to the design of UTF-8, there cannot be false matches:
2498 A sequence of valid UTF-8 bytes cannot be a subsequence of
2499 another valid sequence of UTF-8 bytes.
2500
2501 */
2502 char * const s0 = STRING(scan), *s, *t;
2503 char * const s1 = s0 + STR_LEN(scan) - 1;
2504 char * const s2 = s1 - 4;
e294cc5d
JH
2505#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2506 const char t0[] = "\xaf\x49\xaf\x42";
2507#else
07be1b83 2508 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2509#endif
07be1b83
YO
2510 const char * const t1 = t0 + 3;
2511
2512 for (s = s0 + 2;
2513 s < s2 && (t = ninstr(s, s1, t0, t1));
2514 s = t + 4) {
e294cc5d
JH
2515#ifdef EBCDIC
2516 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2517 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2518#else
07be1b83
YO
2519 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2520 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2521#endif
07be1b83
YO
2522 *min -= 4;
2523 }
2524 }
2525
2526#ifdef DEBUGGING
2527 /* Allow dumping */
2528 n = scan + NODE_SZ_STR(scan);
2529 while (n <= stop) {
2530 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2531 OP(n) = OPTIMIZED;
2532 NEXT_OFF(n) = 0;
2533 }
2534 n++;
2535 }
2536#endif
2537 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2538 return stopnow;
2539}
2540
486ec47a 2541/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2542 Finds fixed substrings. */
2543
a0288114 2544/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2545 to the position after last scanned or to NULL. */
2546
40d049e4
YO
2547#define INIT_AND_WITHP \
2548 assert(!and_withp); \
2549 Newx(and_withp,1,struct regnode_charclass_class); \
2550 SAVEFREEPV(and_withp)
07be1b83 2551
b515a41d 2552/* this is a chain of data about sub patterns we are processing that
486ec47a 2553 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2554 we can simulate recursion without losing state. */
2555struct scan_frame;
2556typedef struct scan_frame {
2557 regnode *last; /* last node to process in this frame */
2558 regnode *next; /* next node to process when last is reached */
2559 struct scan_frame *prev; /*previous frame*/
2560 I32 stop; /* what stopparen do we use */
2561} scan_frame;
2562
304ee84b
YO
2563
2564#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2565
e1d1eefb
YO
2566#define CASE_SYNST_FNC(nAmE) \
2567case nAmE: \
2568 if (flags & SCF_DO_STCLASS_AND) { \
2569 for (value = 0; value < 256; value++) \
2570 if (!is_ ## nAmE ## _cp(value)) \
2571 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2572 } \
2573 else { \
2574 for (value = 0; value < 256; value++) \
2575 if (is_ ## nAmE ## _cp(value)) \
2576 ANYOF_BITMAP_SET(data->start_class, value); \
2577 } \
2578 break; \
2579case N ## nAmE: \
2580 if (flags & SCF_DO_STCLASS_AND) { \
2581 for (value = 0; value < 256; value++) \
2582 if (is_ ## nAmE ## _cp(value)) \
2583 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2584 } \
2585 else { \
2586 for (value = 0; value < 256; value++) \
2587 if (!is_ ## nAmE ## _cp(value)) \
2588 ANYOF_BITMAP_SET(data->start_class, value); \
2589 } \
2590 break
2591
2592
2593
76e3520e 2594STATIC I32
40d049e4 2595S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2596 I32 *minlenp, I32 *deltap,
40d049e4
YO
2597 regnode *last,
2598 scan_data_t *data,
2599 I32 stopparen,
2600 U8* recursed,
2601 struct regnode_charclass_class *and_withp,
2602 U32 flags, U32 depth)
c277df42
IZ
2603 /* scanp: Start here (read-write). */
2604 /* deltap: Write maxlen-minlen here. */
2605 /* last: Stop before this one. */
40d049e4
YO
2606 /* data: string data about the pattern */
2607 /* stopparen: treat close N as END */
2608 /* recursed: which subroutines have we recursed into */
2609 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2610{
97aff369 2611 dVAR;
c277df42
IZ
2612 I32 min = 0, pars = 0, code;
2613 regnode *scan = *scanp, *next;
2614 I32 delta = 0;
2615 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2616 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2617 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2618 scan_data_t data_fake;
a3621e74 2619 SV *re_trie_maxbuff = NULL;
786e8c11 2620 regnode *first_non_open = scan;
e2e6a0f1 2621 I32 stopmin = I32_MAX;
8aa23a47 2622 scan_frame *frame = NULL;
a3621e74 2623 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2624
7918f24d
NC
2625 PERL_ARGS_ASSERT_STUDY_CHUNK;
2626
13a24bad 2627#ifdef DEBUGGING
40d049e4 2628 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2629#endif
40d049e4 2630
786e8c11 2631 if ( depth == 0 ) {
40d049e4 2632 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2633 first_non_open=regnext(first_non_open);
2634 }
2635
b81d288d 2636
8aa23a47
YO
2637 fake_study_recurse:
2638 while ( scan && OP(scan) != END && scan < last ){
2639 /* Peephole optimizer: */
304ee84b 2640 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2641 DEBUG_PEEP("Peep",scan,depth);
2642 JOIN_EXACT(scan,&min,0);
2643
2644 /* Follow the next-chain of the current node and optimize
2645 away all the NOTHINGs from it. */
2646 if (OP(scan) != CURLYX) {
2647 const int max = (reg_off_by_arg[OP(scan)]
2648 ? I32_MAX
2649 /* I32 may be smaller than U16 on CRAYs! */
2650 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2651 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2652 int noff;
2653 regnode *n = scan;
2654
2655 /* Skip NOTHING and LONGJMP. */
2656 while ((n = regnext(n))
2657 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2658 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2659 && off + noff < max)
2660 off += noff;
2661 if (reg_off_by_arg[OP(scan)])
2662 ARG(scan) = off;
2663 else
2664 NEXT_OFF(scan) = off;
2665 }
a3621e74 2666
c277df42 2667
8aa23a47
YO
2668
2669 /* The principal pseudo-switch. Cannot be a switch, since we
2670 look into several different things. */
2671 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2672 || OP(scan) == IFTHEN) {
2673 next = regnext(scan);
2674 code = OP(scan);
2675 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2676
2677 if (OP(next) == code || code == IFTHEN) {
2678 /* NOTE - There is similar code to this block below for handling
2679 TRIE nodes on a re-study. If you change stuff here check there
2680 too. */
2681 I32 max1 = 0, min1 = I32_MAX, num = 0;
2682 struct regnode_charclass_class accum;
2683 regnode * const startbranch=scan;
2684
2685 if (flags & SCF_DO_SUBSTR)
304ee84b 2686 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2687 if (flags & SCF_DO_STCLASS)
2688 cl_init_zero(pRExC_state, &accum);
2689
2690 while (OP(scan) == code) {
2691 I32 deltanext, minnext, f = 0, fake;
2692 struct regnode_charclass_class this_class;
2693
2694 num++;
2695 data_fake.flags = 0;
2696 if (data) {
2697 data_fake.whilem_c = data->whilem_c;
2698 data_fake.last_closep = data->last_closep;
2699 }
2700 else
2701 data_fake.last_closep = &fake;
58e23c8d
YO
2702
2703 data_fake.pos_delta = delta;
8aa23a47
YO
2704 next = regnext(scan);
2705 scan = NEXTOPER(scan);
2706 if (code != BRANCH)
c277df42 2707 scan = NEXTOPER(scan);
8aa23a47
YO
2708 if (flags & SCF_DO_STCLASS) {
2709 cl_init(pRExC_state, &this_class);
2710 data_fake.start_class = &this_class;
2711 f = SCF_DO_STCLASS_AND;
58e23c8d 2712 }
8aa23a47
YO
2713 if (flags & SCF_WHILEM_VISITED_POS)
2714 f |= SCF_WHILEM_VISITED_POS;
2715
2716 /* we suppose the run is continuous, last=next...*/
2717 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2718 next, &data_fake,
2719 stopparen, recursed, NULL, f,depth+1);
2720 if (min1 > minnext)
2721 min1 = minnext;
2722 if (max1 < minnext + deltanext)
2723 max1 = minnext + deltanext;
2724 if (deltanext == I32_MAX)
2725 is_inf = is_inf_internal = 1;
2726 scan = next;
2727 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2728 pars++;
2729 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2730 if ( stopmin > minnext)
2731 stopmin = min + min1;
2732 flags &= ~SCF_DO_SUBSTR;
2733 if (data)
2734 data->flags |= SCF_SEEN_ACCEPT;
2735 }
2736 if (data) {
2737 if (data_fake.flags & SF_HAS_EVAL)
2738 data->flags |= SF_HAS_EVAL;
2739 data->whilem_c = data_fake.whilem_c;
3dab1dad 2740 }
8aa23a47
YO
2741 if (flags & SCF_DO_STCLASS)
2742 cl_or(pRExC_state, &accum, &this_class);
2743 }
2744 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2745 min1 = 0;
2746 if (flags & SCF_DO_SUBSTR) {
2747 data->pos_min += min1;
2748 data->pos_delta += max1 - min1;
2749 if (max1 != min1 || is_inf)
2750 data->longest = &(data->longest_float);
2751 }
2752 min += min1;
2753 delta += max1 - min1;
2754 if (flags & SCF_DO_STCLASS_OR) {
2755 cl_or(pRExC_state, data->start_class, &accum);
2756 if (min1) {
2757 cl_and(data->start_class, and_withp);
2758 flags &= ~SCF_DO_STCLASS;
653099ff 2759 }
8aa23a47
YO
2760 }
2761 else if (flags & SCF_DO_STCLASS_AND) {
2762 if (min1) {
2763 cl_and(data->start_class, &accum);
2764 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2765 }
8aa23a47
YO
2766 else {
2767 /* Switch to OR mode: cache the old value of
2768 * data->start_class */
2769 INIT_AND_WITHP;
2770 StructCopy(data->start_class, and_withp,
2771 struct regnode_charclass_class);
2772 flags &= ~SCF_DO_STCLASS_AND;
2773 StructCopy(&accum, data->start_class,
2774 struct regnode_charclass_class);
2775 flags |= SCF_DO_STCLASS_OR;
2776 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2777 }
8aa23a47 2778 }
a3621e74 2779
8aa23a47
YO
2780 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2781 /* demq.
a3621e74 2782
8aa23a47
YO
2783 Assuming this was/is a branch we are dealing with: 'scan' now
2784 points at the item that follows the branch sequence, whatever
2785 it is. We now start at the beginning of the sequence and look
2786 for subsequences of
a3621e74 2787
8aa23a47
YO
2788 BRANCH->EXACT=>x1
2789 BRANCH->EXACT=>x2
2790 tail
a3621e74 2791
8aa23a47 2792 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2793
486ec47a 2794 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2795 element into a trie and then add the subsequent branch exact
2796 strings to the trie.
a3621e74 2797
8aa23a47 2798 We have two cases
a3621e74 2799
3b753521 2800 1. patterns where the whole set of branches can be converted.
a3621e74 2801
8aa23a47 2802 2. patterns where only a subset can be converted.
a3621e74 2803
8aa23a47
YO
2804 In case 1 we can replace the whole set with a single regop
2805 for the trie. In case 2 we need to keep the start and end
3b753521 2806 branches so
a3621e74 2807
8aa23a47
YO
2808 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2809 becomes BRANCH TRIE; BRANCH X;
786e8c11 2810
8aa23a47
YO
2811 There is an additional case, that being where there is a
2812 common prefix, which gets split out into an EXACT like node
2813 preceding the TRIE node.
a3621e74 2814
8aa23a47
YO
2815 If x(1..n)==tail then we can do a simple trie, if not we make
2816 a "jump" trie, such that when we match the appropriate word
486ec47a 2817 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2818 a nested if into a case structure of sorts.
b515a41d 2819
8aa23a47
YO
2820 */
2821
2822 int made=0;
2823 if (!re_trie_maxbuff) {
2824 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2825 if (!SvIOK(re_trie_maxbuff))
2826 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2827 }
2828 if ( SvIV(re_trie_maxbuff)>=0 ) {
2829 regnode *cur;
2830 regnode *first = (regnode *)NULL;
2831 regnode *last = (regnode *)NULL;
2832 regnode *tail = scan;
2833 U8 optype = 0;
2834 U32 count=0;
a3621e74
YO
2835
2836#ifdef DEBUGGING
8aa23a47 2837 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2838#endif
8aa23a47
YO
2839 /* var tail is used because there may be a TAIL
2840 regop in the way. Ie, the exacts will point to the
2841 thing following the TAIL, but the last branch will
2842 point at the TAIL. So we advance tail. If we
2843 have nested (?:) we may have to move through several
2844 tails.
2845 */
2846
2847 while ( OP( tail ) == TAIL ) {
2848 /* this is the TAIL generated by (?:) */
2849 tail = regnext( tail );
2850 }
a3621e74 2851
8aa23a47
YO
2852
2853 DEBUG_OPTIMISE_r({
2854 regprop(RExC_rx, mysv, tail );
2855 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2856 (int)depth * 2 + 2, "",
2857 "Looking for TRIE'able sequences. Tail node is: ",
2858 SvPV_nolen_const( mysv )
2859 );
2860 });
2861
2862 /*
2863
2864 step through the branches, cur represents each
2865 branch, noper is the first thing to be matched
2866 as part of that branch and noper_next is the
2867 regnext() of that node. if noper is an EXACT
2868 and noper_next is the same as scan (our current
2869 position in the regex) then the EXACT branch is
2870 a possible optimization target. Once we have
486ec47a 2871 two or more consecutive such branches we can
8aa23a47
YO
2872 create a trie of the EXACT's contents and stich
2873 it in place. If the sequence represents all of
2874 the branches we eliminate the whole thing and
2875 replace it with a single TRIE. If it is a
2876 subsequence then we need to stitch it in. This
2877 means the first branch has to remain, and needs
2878 to be repointed at the item on the branch chain
2879 following the last branch optimized. This could
2880 be either a BRANCH, in which case the
2881 subsequence is internal, or it could be the
2882 item following the branch sequence in which
2883 case the subsequence is at the end.
2884
2885 */
2886
2887 /* dont use tail as the end marker for this traverse */
2888 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2889 regnode * const noper = NEXTOPER( cur );
b515a41d 2890#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2891 regnode * const noper_next = regnext( noper );
b515a41d
YO
2892#endif
2893
8aa23a47
YO
2894 DEBUG_OPTIMISE_r({
2895 regprop(RExC_rx, mysv, cur);
2896 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2897 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2898
2899 regprop(RExC_rx, mysv, noper);
2900 PerlIO_printf( Perl_debug_log, " -> %s",
2901 SvPV_nolen_const(mysv));
2902
2903 if ( noper_next ) {
2904 regprop(RExC_rx, mysv, noper_next );
2905 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2906 SvPV_nolen_const(mysv));
2907 }
2908 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2909 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2910 });
2911 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2912 : PL_regkind[ OP( noper ) ] == EXACT )
2913 || OP(noper) == NOTHING )
786e8c11 2914#ifdef NOJUMPTRIE
8aa23a47 2915 && noper_next == tail
786e8c11 2916#endif
8aa23a47
YO
2917 && count < U16_MAX)
2918 {
2919 count++;
2920 if ( !first || optype == NOTHING ) {
2921 if (!first) first = cur;
2922 optype = OP( noper );
2923 } else {
2924 last = cur;
2925 }
2926 } else {
a0a388a1 2927/*
0abd0d78
YO
2928 Currently we do not believe that the trie logic can
2929 handle case insensitive matching properly when the
2930 pattern is not unicode (thus forcing unicode semantics).
2931
2932 If/when this is fixed the following define can be swapped
2933 in below to fully enable trie logic.
2934
a0a388a1 2935#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2936
2937*/
2938#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2939
a0a388a1 2940 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2941 make_trie( pRExC_state,
2942 startbranch, first, cur, tail, count,
2943 optype, depth+1 );
2944 }
2945 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2946#ifdef NOJUMPTRIE
8aa23a47 2947 && noper_next == tail
786e8c11 2948#endif
8aa23a47
YO
2949 ){
2950 count = 1;
2951 first = cur;
2952 optype = OP( noper );
2953 } else {
2954 count = 0;
2955 first = NULL;
2956 optype = 0;
2957 }
2958 last = NULL;
2959 }
2960 }
2961 DEBUG_OPTIMISE_r({
2962 regprop(RExC_rx, mysv, cur);
2963 PerlIO_printf( Perl_debug_log,
2964 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2965 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2966
2967 });
a0a388a1
YO
2968
2969 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2970 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2971#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2972 if ( ((made == MADE_EXACT_TRIE &&
2973 startbranch == first)
2974 || ( first_non_open == first )) &&
2975 depth==0 ) {
2976 flags |= SCF_TRIE_RESTUDY;
2977 if ( startbranch == first
2978 && scan == tail )
2979 {
2980 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2981 }
2982 }
3dab1dad 2983#endif
8aa23a47
YO
2984 }
2985 }
2986
2987 } /* do trie */
2988
653099ff 2989 }
8aa23a47
YO
2990 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2991 scan = NEXTOPER(NEXTOPER(scan));
2992 } else /* single branch is optimized. */
2993 scan = NEXTOPER(scan);
2994 continue;
2995 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2996 scan_frame *newframe = NULL;
2997 I32 paren;
2998 regnode *start;
2999 regnode *end;
3000
3001 if (OP(scan) != SUSPEND) {
3002 /* set the pointer */
3003 if (OP(scan) == GOSUB) {
3004 paren = ARG(scan);
3005 RExC_recurse[ARG2L(scan)] = scan;
3006 start = RExC_open_parens[paren-1];
3007 end = RExC_close_parens[paren-1];
3008 } else {
3009 paren = 0;
f8fc2ecf 3010 start = RExC_rxi->program + 1;
8aa23a47
YO
3011 end = RExC_opend;
3012 }
3013 if (!recursed) {
3014 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3015 SAVEFREEPV(recursed);
3016 }
3017 if (!PAREN_TEST(recursed,paren+1)) {
3018 PAREN_SET(recursed,paren+1);
3019 Newx(newframe,1,scan_frame);
3020 } else {
3021 if (flags & SCF_DO_SUBSTR) {
304ee84b 3022 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3023 data->longest = &(data->longest_float);
3024 }
3025 is_inf = is_inf_internal = 1;
3026 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3027 cl_anything(pRExC_state, data->start_class);
3028 flags &= ~SCF_DO_STCLASS;
3029 }
3030 } else {
3031 Newx(newframe,1,scan_frame);
3032 paren = stopparen;
3033 start = scan+2;
3034 end = regnext(scan);
3035 }
3036 if (newframe) {
3037 assert(start);
3038 assert(end);
3039 SAVEFREEPV(newframe);
3040 newframe->next = regnext(scan);
3041 newframe->last = last;
3042 newframe->stop = stopparen;
3043 newframe->prev = frame;
3044
3045 frame = newframe;
3046 scan = start;
3047 stopparen = paren;
3048 last = end;
3049
3050 continue;
3051 }
3052 }
3053 else if (OP(scan) == EXACT) {
3054 I32 l = STR_LEN(scan);
3055 UV uc;
3056 if (UTF) {
3057 const U8 * const s = (U8*)STRING(scan);
3058 l = utf8_length(s, s + l);
3059 uc = utf8_to_uvchr(s, NULL);
3060 } else {
3061 uc = *((U8*)STRING(scan));
3062 }
3063 min += l;
3064 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3065 /* The code below prefers earlier match for fixed
3066 offset, later match for variable offset. */
3067 if (data->last_end == -1) { /* Update the start info. */
3068 data->last_start_min = data->pos_min;
3069 data->last_start_max = is_inf
3070 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3071 }
8aa23a47
YO
3072 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3073 if (UTF)
3074 SvUTF8_on(data->last_found);
3075 {
3076 SV * const sv = data->last_found;
3077 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3078 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3079 if (mg && mg->mg_len >= 0)
3080 mg->mg_len += utf8_length((U8*)STRING(scan),
3081 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3082 }
8aa23a47
YO
3083 data->last_end = data->pos_min + l;
3084 data->pos_min += l; /* As in the first entry. */
3085 data->flags &= ~SF_BEFORE_EOL;
3086 }
3087 if (flags & SCF_DO_STCLASS_AND) {
3088 /* Check whether it is compatible with what we know already! */
3089 int compat = 1;
3090
54251c2e 3091
486ec47a 3092 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3093 * in the bitmp and either 1) its bit or its fold is set, or 2)
3094 * it's for a locale. Even if there isn't unicode semantics
3095 * here, at runtime there may be because of matching against a
3096 * utf8 string, so accept a possible false positive for
3097 * latin1-range folds */
8aa23a47
YO
3098 if (uc >= 0x100 ||
3099 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3100 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3101 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3102 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47
YO
3103 )
3104 compat = 0;
3105 ANYOF_CLASS_ZERO(data->start_class);
3106 ANYOF_BITMAP_ZERO(data->start_class);
3107 if (compat)
3108 ANYOF_BITMAP_SET(data->start_class, uc);
3109 data->start_class->flags &= ~ANYOF_EOS;
3110 if (uc < 0x100)
3111 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3112 }
3113 else if (flags & SCF_DO_STCLASS_OR) {
3114 /* false positive possible if the class is case-folded */
3115 if (uc < 0x100)
3116 ANYOF_BITMAP_SET(data->start_class, uc);
3117 else
3118 data->start_class->flags |= ANYOF_UNICODE_ALL;
3119 data->start_class->flags &= ~ANYOF_EOS;
3120 cl_and(data->start_class, and_withp);
3121 }
3122 flags &= ~SCF_DO_STCLASS;
3123 }
3124 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3125 I32 l = STR_LEN(scan);
3126 UV uc = *((U8*)STRING(scan));
3127
3128 /* Search for fixed substrings supports EXACT only. */
3129 if (flags & SCF_DO_SUBSTR) {
3130 assert(data);
304ee84b 3131 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3132 }
3133 if (UTF) {
3134 const U8 * const s = (U8 *)STRING(scan);
3135 l = utf8_length(s, s + l);
3136 uc = utf8_to_uvchr(s, NULL);
3137 }
3138 min += l;
3139 if (flags & SCF_DO_SUBSTR)
3140 data->pos_min += l;
3141 if (flags & SCF_DO_STCLASS_AND) {
3142 /* Check whether it is compatible with what we know already! */
3143 int compat = 1;
8aa23a47 3144 if (uc >= 0x100 ||
54251c2e
KW
3145 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3146 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3147 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3148 {
8aa23a47 3149 compat = 0;
54251c2e 3150 }
8aa23a47
YO
3151 ANYOF_CLASS_ZERO(data->start_class);
3152 ANYOF_BITMAP_ZERO(data->start_class);
3153 if (compat) {
3154 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3155 data->start_class->flags &= ~ANYOF_EOS;
39065660 3156 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3157 if (OP(scan) == EXACTFL) {
8aa23a47 3158 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3159 }
3160 else {
3161
54251c2e
KW
3162 /* Also set the other member of the fold pair. In case
3163 * that unicode semantics is called for at runtime, use
3164 * the full latin1 fold. (Can't do this for locale,
3165 * because not known until runtime */
3166 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3167 }
653099ff 3168 }
8aa23a47
YO
3169 }
3170 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3171 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3172 /* false positive possible if the class is case-folded.
3173 Assume that the locale settings are the same... */
970c8436 3174 if (uc < 0x100) {
1aa99e6b 3175 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3176 if (OP(scan) != EXACTFL) {
3177
3178 /* And set the other member of the fold pair, but
3179 * can't do that in locale because not known until
3180 * run-time */
3181 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3182 PL_fold_latin1[uc]);
970c8436
KW
3183 }
3184 }
653099ff
GS
3185 data->start_class->flags &= ~ANYOF_EOS;
3186 }
8aa23a47 3187 cl_and(data->start_class, and_withp);
653099ff 3188 }
8aa23a47
YO
3189 flags &= ~SCF_DO_STCLASS;
3190 }
e52fc539 3191 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3192 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3193 I32 f = flags, pos_before = 0;
3194 regnode * const oscan = scan;
3195 struct regnode_charclass_class this_class;
3196 struct regnode_charclass_class *oclass = NULL;
3197 I32 next_is_eval = 0;
3198
3199 switch (PL_regkind[OP(scan)]) {
3200 case WHILEM: /* End of (?:...)* . */
3201 scan = NEXTOPER(scan);
3202 goto finish;
3203 case PLUS:
3204 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3205 next = NEXTOPER(scan);
3206 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3207 mincount = 1;
3208 maxcount = REG_INFTY;
3209 next = regnext(scan);
3210 scan = NEXTOPER(scan);
3211 goto do_curly;
3212 }
3213 }
3214 if (flags & SCF_DO_SUBSTR)
3215 data->pos_min++;
3216 min++;
3217 /* Fall through. */
3218 case STAR:
3219 if (flags & SCF_DO_STCLASS) {
3220 mincount = 0;
3221 maxcount = REG_INFTY;
3222 next = regnext(scan);
3223 scan = NEXTOPER(scan);
3224 goto do_curly;
3225 }
3226 is_inf = is_inf_internal = 1;
3227 scan = regnext(scan);
c277df42 3228 if (flags & SCF_DO_SUBSTR) {
304ee84b 3229 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3230 data->longest = &(data->longest_float);
c277df42 3231 }
8aa23a47
YO
3232 goto optimize_curly_tail;
3233 case CURLY:
3234 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3235 && (scan->flags == stopparen))
3236 {
3237 mincount = 1;
3238 maxcount = 1;
3239 } else {
3240 mincount = ARG1(scan);
3241 maxcount = ARG2(scan);
653099ff 3242 }
8aa23a47
YO
3243 next = regnext(scan);
3244 if (OP(scan) == CURLYX) {
3245 I32 lp = (data ? *(data->last_closep) : 0);
3246 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3247 }
8aa23a47
YO
3248 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3249 next_is_eval = (OP(scan) == EVAL);
3250 do_curly:
3251 if (flags & SCF_DO_SUBSTR) {
304ee84b 3252 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3253 pos_before = data->pos_min;
b45f050a 3254 }
8aa23a47
YO
3255 if (data) {
3256 fl = data->flags;
3257 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3258 if (is_inf)
3259 data->flags |= SF_IS_INF;
3260 }
3261 if (flags & SCF_DO_STCLASS) {
3262 cl_init(pRExC_state, &this_class);
3263 oclass = data->start_class;
3264 data->start_class = &this_class;
3265 f |= SCF_DO_STCLASS_AND;
3266 f &= ~SCF_DO_STCLASS_OR;
3267 }
779bcb7d
NC
3268 /* Exclude from super-linear cache processing any {n,m}
3269 regops for which the combination of input pos and regex
3270 pos is not enough information to determine if a match
3271 will be possible.
3272
3273 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3274 regex pos at the \s*, the prospects for a match depend not
3275 only on the input position but also on how many (bar\s*)
3276 repeats into the {4,8} we are. */
3277 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3278 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3279
8aa23a47
YO
3280 /* This will finish on WHILEM, setting scan, or on NULL: */
3281 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3282 last, data, stopparen, recursed, NULL,
3283 (mincount == 0
3284 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3285
8aa23a47
YO
3286 if (flags & SCF_DO_STCLASS)
3287 data->start_class = oclass;
3288 if (mincount == 0 || minnext == 0) {
3289 if (flags & SCF_DO_STCLASS_OR) {
3290 cl_or(pRExC_state, data->start_class, &this_class);
3291 }
3292 else if (flags & SCF_DO_STCLASS_AND) {
3293 /* Switch to OR mode: cache the old value of
3294 * data->start_class */
3295 INIT_AND_WITHP;
3296 StructCopy(data->start_class, and_withp,
3297 struct regnode_charclass_class);
3298 flags &= ~SCF_DO_STCLASS_AND;
3299 StructCopy(&this_class, data->start_class,
3300 struct regnode_charclass_class);
3301 flags |= SCF_DO_STCLASS_OR;
3302 data->start_class->flags |= ANYOF_EOS;
3303 }
3304 } else { /* Non-zero len */
3305 if (flags & SCF_DO_STCLASS_OR) {
3306 cl_or(pRExC_state, data->start_class, &this_class);
3307 cl_and(data->start_class, and_withp);
3308 }
3309 else if (flags & SCF_DO_STCLASS_AND)
3310 cl_and(data->start_class, &this_class);
3311 flags &= ~SCF_DO_STCLASS;
3312 }
3313 if (!scan) /* It was not CURLYX, but CURLY. */
3314 scan = next;
3315 if ( /* ? quantifier ok, except for (?{ ... }) */
3316 (next_is_eval || !(mincount == 0 && maxcount == 1))
3317 && (minnext == 0) && (deltanext == 0)
3318 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3319 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3320 {
668c081a
NC
3321 ckWARNreg(RExC_parse,
3322 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3323 }
3324
3325 min += minnext * mincount;
3326 is_inf_internal |= ((maxcount == REG_INFTY
3327 && (minnext + deltanext) > 0)
3328 || deltanext == I32_MAX);
3329 is_inf |= is_inf_internal;
3330 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3331
3332 /* Try powerful optimization CURLYX => CURLYN. */
3333 if ( OP(oscan) == CURLYX && data
3334 && data->flags & SF_IN_PAR
3335 && !(data->flags & SF_HAS_EVAL)
3336 && !deltanext && minnext == 1 ) {
3337 /* Try to optimize to CURLYN. */
3338 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3339 regnode * const nxt1 = nxt;
497b47a8 3340#ifdef DEBUGGING
8aa23a47 3341 regnode *nxt2;
497b47a8 3342#endif
c277df42 3343
8aa23a47
YO
3344 /* Skip open. */
3345 nxt = regnext(nxt);
e52fc539 3346 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3347 && !(PL_regkind[OP(nxt)] == EXACT
3348 && STR_LEN(nxt) == 1))
3349 goto nogo;
497b47a8 3350#ifdef DEBUGGING
8aa23a47 3351 nxt2 = nxt;
497b47a8 3352#endif
8aa23a47
YO
3353 nxt = regnext(nxt);
3354 if (OP(nxt) != CLOSE)
3355 goto nogo;
3356 if (RExC_open_parens) {
3357 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3358 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3359 }
3360 /* Now we know that nxt2 is the only contents: */
3361 oscan->flags = (U8)ARG(nxt);
3362 OP(oscan) = CURLYN;
3363 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3364
c277df42 3365#ifdef DEBUGGING
8aa23a47 3366 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3367 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3368 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3369 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3370 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3371 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3372#endif
8aa23a47
YO
3373 }
3374 nogo:
3375
3376 /* Try optimization CURLYX => CURLYM. */
3377 if ( OP(oscan) == CURLYX && data
3378 && !(data->flags & SF_HAS_PAR)
3379 && !(data->flags & SF_HAS_EVAL)
3380 && !deltanext /* atom is fixed width */
3381 && minnext != 0 /* CURLYM can't handle zero width */
3382 ) {
3383 /* XXXX How to optimize if data == 0? */
3384 /* Optimize to a simpler form. */
3385 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3386 regnode *nxt2;
3387
3388 OP(oscan) = CURLYM;
3389 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3390 && (OP(nxt2) != WHILEM))
3391 nxt = nxt2;
3392 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3393 /* Need to optimize away parenths. */
b3c0965f 3394 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3395 /* Set the parenth number. */
3396 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3397
8aa23a47
YO
3398 oscan->flags = (U8)ARG(nxt);
3399 if (RExC_open_parens) {
3400 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3401 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3402 }
8aa23a47
YO
3403 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3404 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3405
c277df42 3406#ifdef DEBUGGING
8aa23a47
YO
3407 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3408 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PAR
3409 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3410 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3411#endif
c277df42 3412#if 0
8aa23a47
YO
3413 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3414 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3415 if (nnxt == nxt) {
3416 if (reg_off_by_arg[OP(nxt1)])
3417 ARG_SET(nxt1, nxt2 - nxt1);
3418 else if (nxt2 - nxt1 < U16_MAX)
3419 NEXT_OFF(nxt1) = nxt2 - nxt1;
3420 else
3421 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3422 }
8aa23a47 3423 nxt1 = nnxt;
c277df42 3424 }
5d1c421c 3425#endif
8aa23a47
YO
3426 /* Optimize again: */
3427 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3428 NULL, stopparen, recursed, NULL, 0,depth+1);
3429 }
3430 else
3431 oscan->flags = 0;
3432 }
3433 else if ((OP(oscan) == CURLYX)
3434 && (flags & SCF_WHILEM_VISITED_POS)
3435 /* See the comment on a similar expression above.
3b753521 3436 However, this time it's not a subexpression
8aa23a47
YO
3437 we care about, but the expression itself. */
3438 && (maxcount == REG_INFTY)
3439 && data && ++data->whilem_c < 16) {
3440 /* This stays as CURLYX, we can put the count/of pair. */
3441 /* Find WHILEM (as in regexec.c) */
3442 regnode *nxt = oscan + NEXT_OFF(oscan);
3443
3444 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3445 nxt += ARG(nxt);
3446 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3447 | (RExC_whilem_seen << 4)); /* On WHILEM */
3448 }
3449 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3450 pars++;
3451 if (flags & SCF_DO_SUBSTR) {
3452 SV *last_str = NULL;
3453 int counted = mincount != 0;
a0ed51b3 3454
8aa23a47
YO
3455 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3456#if defined(SPARC64_GCC_WORKAROUND)
3457 I32 b = 0;
3458 STRLEN l = 0;
3459 const char *s = NULL;
3460 I32 old = 0;
b515a41d 3461
8aa23a47
YO
3462 if (pos_before >= data->last_start_min)
3463 b = pos_before;
3464 else
3465 b = data->last_start_min;
b515a41d 3466
8aa23a47
YO
3467 l = 0;
3468 s = SvPV_const(data->last_found, l);
3469 old = b - data->last_start_min;
3470
3471#else
3472 I32 b = pos_before >= data->last_start_min
3473 ? pos_before : data->last_start_min;
3474 STRLEN l;
3475 const char * const s = SvPV_const(data->last_found, l);
3476 I32 old = b - data->last_start_min;
3477#endif
3478
3479 if (UTF)
3480 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3481 l -= old;
3482 /* Get the added string: */
740cce10 3483 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3484 if (deltanext == 0 && pos_before == b) {
3485 /* What was added is a constant string */
3486 if (mincount > 1) {
3487 SvGROW(last_str, (mincount * l) + 1);
3488 repeatcpy(SvPVX(last_str) + l,
3489 SvPVX_const(last_str), l, mincount - 1);
3490 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3491 /* Add additional parts. */
3492 SvCUR_set(data->last_found,
3493 SvCUR(data->last_found) - l);
3494 sv_catsv(data->last_found, last_str);
3495 {
3496 SV * sv = data->last_found;
3497 MAGIC *mg =
3498 SvUTF8(sv) && SvMAGICAL(sv) ?
3499 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3500 if (mg && mg->mg_len >= 0)
bd94e887 3501 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3502 }
8aa23a47 3503 data->last_end += l * (mincount - 1);
b515a41d 3504 }
8aa23a47
YO
3505 } else {
3506 /* start offset must point into the last copy */
3507 data->last_start_min += minnext * (mincount - 1);
3508 data->last_start_max += is_inf ? I32_MAX
3509 : (maxcount - 1) * (minnext + data->pos_delta);
3510 }
c277df42 3511 }
8aa23a47
YO
3512 /* It is counted once already... */
3513 data->pos_min += minnext * (mincount - counted);
3514 data->pos_delta += - counted * deltanext +
3515 (minnext + deltanext) * maxcount - minnext * mincount;
3516 if (mincount != maxcount) {
3517 /* Cannot extend fixed substrings found inside
3518 the group. */
304ee84b 3519 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3520 if (mincount && last_str) {
3521 SV * const sv = data->last_found;
3522 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3523 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3524
3525 if (mg)
3526 mg->mg_len = -1;
3527 sv_setsv(sv, last_str);
3528 data->last_end = data->pos_min;
3529 data->last_start_min =
3530 data->pos_min - CHR_SVLEN(last_str);
3531 data->last_start_max = is_inf
3532 ? I32_MAX
3533 : data->pos_min + data->pos_delta
3534 - CHR_SVLEN(last_str);
3535 }
3536 data->longest = &(data->longest_float);
3537 }
3538 SvREFCNT_dec(last_str);
c277df42 3539 }
8aa23a47
YO
3540 if (data && (fl & SF_HAS_EVAL))
3541 data->flags |= SF_HAS_EVAL;
3542 optimize_curly_tail:
3543 if (OP(oscan) != CURLYX) {
3544 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3545 && NEXT_OFF(next))
3546 NEXT_OFF(oscan) += NEXT_OFF(next);
3547 }
3548 continue;
f56b6394 3549 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 3550 if (flags & SCF_DO_SUBSTR) {
304ee84b 3551 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3552 data->longest = &(data->longest_float);
3553 }
3554 is_inf = is_inf_internal = 1;
3555 if (flags & SCF_DO_STCLASS_OR)
3556 cl_anything(pRExC_state, data->start_class);
3557 flags &= ~SCF_DO_STCLASS;
3558 break;
c277df42 3559 }
8aa23a47 3560 }
e1d1eefb
YO
3561 else if (OP(scan) == LNBREAK) {
3562 if (flags & SCF_DO_STCLASS) {
3563 int value = 0;
3564 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3565 if (flags & SCF_DO_STCLASS_AND) {
3566 for (value = 0; value < 256; value++)
e64b1bd1 3567 if (!is_VERTWS_cp(value))
b9a59e08
KW
3568 ANYOF_BITMAP_CLEAR(data->start_class, value);
3569 }
3570 else {
e1d1eefb 3571 for (value = 0; value < 256; value++)
e64b1bd1 3572 if (is_VERTWS_cp(value))
b9a59e08
KW
3573 ANYOF_BITMAP_SET(data->start_class, value);
3574 }
e1d1eefb
YO
3575 if (flags & SCF_DO_STCLASS_OR)
3576 cl_and(data->start_class, and_withp);
3577 flags &= ~SCF_DO_STCLASS;
3578 }
3579 min += 1;
f9a79580 3580 delta += 1;
e1d1eefb
YO
3581 if (flags & SCF_DO_SUBSTR) {
3582 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3583 data->pos_min += 1;
f9a79580 3584 data->pos_delta += 1;
e1d1eefb
YO
3585 data->longest = &(data->longest_float);
3586 }
e1d1eefb 3587 }
f9a79580 3588 else if (OP(scan) == FOLDCHAR) {
ced7f090 3589 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
f9a79580
RGS
3590 flags &= ~SCF_DO_STCLASS;
3591 min += 1;
3592 delta += d;
3593 if (flags & SCF_DO_SUBSTR) {
3594 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3595 data->pos_min += 1;
3596 data->pos_delta += d;
3597 data->longest = &(data->longest_float);
3598 }
3599 }
e52fc539 3600 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3601 int value = 0;
653099ff 3602
8aa23a47 3603 if (flags & SCF_DO_SUBSTR) {
304ee84b 3604 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3605 data->pos_min++;
3606 }
3607 min++;
3608 if (flags & SCF_DO_STCLASS) {
3609 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3610
8aa23a47
YO
3611 /* Some of the logic below assumes that switching
3612 locale on will only add false positives. */
3613 switch (PL_regkind[OP(scan)]) {
3614 case SANY:
3615 default:
3616 do_default:
3617 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3618 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3619 cl_anything(pRExC_state, data->start_class);
3620 break;
3621 case REG_ANY:
3622 if (OP(scan) == SANY)
3623 goto do_default;
3624 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3625 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 3626 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
8aa23a47 3627 cl_anything(pRExC_state, data->start_class);
653099ff 3628 }
8aa23a47
YO
3629 if (flags & SCF_DO_STCLASS_AND || !value)
3630 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3631 break;
3632 case ANYOF:
3633 if (flags & SCF_DO_STCLASS_AND)
3634 cl_and(data->start_class,
3635 (struct regnode_charclass_class*)scan);
653099ff 3636 else
8aa23a47
YO
3637 cl_or(pRExC_state, data->start_class,
3638 (struct regnode_charclass_class*)scan);
3639 break;
3640 case ALNUM:
3641 if (flags & SCF_DO_STCLASS_AND) {
3642 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3643 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
980866de 3644 if (OP(scan) == ALNUMU) {
a12cf05f
KW
3645 for (value = 0; value < 256; value++) {
3646 if (!isWORDCHAR_L1(value)) {
3647 ANYOF_BITMAP_CLEAR(data->start_class, value);
3648 }
3649 }
3650 } else {
3651 for (value = 0; value < 256; value++) {
3652 if (!isALNUM(value)) {
3653 ANYOF_BITMAP_CLEAR(data->start_class, value);
3654 }
3655 }
3656 }
8aa23a47 3657 }
653099ff 3658 }
8aa23a47
YO
3659 else {
3660 if (data->start_class->flags & ANYOF_LOCALE)
3661 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
980866de 3662 else if (OP(scan) == ALNUMU) {
a12cf05f
KW
3663 for (value = 0; value < 256; value++) {
3664 if (isWORDCHAR_L1(value)) {
3665 ANYOF_BITMAP_SET(data->start_class, value);
3666 }
3667 }
3668 } else {
3669 for (value = 0; value < 256; value++) {
3670 if (isALNUM(value)) {
3671 ANYOF_BITMAP_SET(data->start_class, value);
3672 }
3673 }
3674 }
8aa23a47
YO
3675 }
3676 break;
8aa23a47
YO
3677 case NALNUM:
3678 if (flags & SCF_DO_STCLASS_AND) {
3679 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3680 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
980866de 3681 if (OP(scan) == NALNUMU) {
a12cf05f
KW
3682 for (value = 0; value < 256; value++) {
3683 if (isWORDCHAR_L1(value)) {
3684</