This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: add missing code for optimizer for \W
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d
AD
88#include "dquote_static.c"
89
d4cce5f1 90#ifdef op
11343788 91#undef op
d4cce5f1 92#endif /* op */
11343788 93
fe14fcc3 94#ifdef MSDOS
7e4e8c89 95# if defined(BUGGY_MSC6)
fe14fcc3 96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 97# pragma optimize("a",off)
fe14fcc3 98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
fe14fcc3
LW
101#endif /* MSDOS */
102
a687059c
LW
103#ifndef STATIC
104#define STATIC static
105#endif
106
830247a4 107typedef struct RExC_state_t {
e2509266 108 U32 flags; /* are we folding, multilining? */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 113 char *start; /* Start of input for compile */
830247a4
IZ
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 117 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 118 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
c74340f9
YO
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 126 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
81714fb9 137 HV *paren_names; /* Paren names */
1f1031fe 138
40d049e4
YO
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
3dab1dad 145#ifdef DEBUGGING
be8e71aa 146 const char *lastparse;
3dab1dad 147 I32 lastnum;
1f1031fe 148 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 151#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 152#endif
830247a4
IZ
153} RExC_state_t;
154
e2509266 155#define RExC_flags (pRExC_state->flags)
830247a4 156#define RExC_precomp (pRExC_state->precomp)
288b8c02 157#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 158#define RExC_rx (pRExC_state->rx)
f8fc2ecf 159#define RExC_rxi (pRExC_state->rxi)
fac92740 160#define RExC_start (pRExC_state->start)
830247a4
IZ
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
830247a4 167#define RExC_emit (pRExC_state->emit)
fac92740 168#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 169#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
e2e6a0f1 175#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 179#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3 200#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
204#define SIMPLE 0x02
a3b492c3
YO
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 208
3dab1dad
YO
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
07be1b83
YO
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
786e8c11 214#define FULL_TRIE_STUDY
07be1b83
YO
215#define TRIE_STCLASS
216#endif
1de06328
YO
217
218
40d049e4
YO
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
bbd61b5f
KW
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
40d049e4 231
1de06328
YO
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
3b753521 238 string that must appear at a fixed location, and we look for the
1de06328
YO
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
259 characters must match before the string we are searching for.
260 Likewise when combined with minlenp and the length of the string it
1de06328
YO
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
3b753521 266 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
486ec47a 283 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
2c2d71f5
JH
306
307typedef struct scan_data_t {
1de06328
YO
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
2c2d71f5
JH
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
1de06328 313 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
314 I32 last_start_min;
315 I32 last_start_max;
1de06328
YO
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
486ec47a 319 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 324 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 325 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
326 I32 flags;
327 I32 whilem_c;
cb434fcc 328 I32 *last_closep;
653099ff 329 struct regnode_charclass_class *start_class;
2c2d71f5
JH
330} scan_data_t;
331
a687059c 332/*
e50aee73 333 * Forward declarations for pregcomp()'s friends.
a687059c 334 */
a0d0e21e 335
27da23d5 336static const scan_data_t zero_scan_data =
1de06328 337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
09b7f37c
CB
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
c277df42
IZ
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
653099ff
GS
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 366#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 367
786e8c11 368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 369#define SCF_SEEN_ACCEPT 0x8000
07be1b83 370
43fead97 371#define UTF cBOOL(RExC_utf8)
a62b1201
KW
372#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
373#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
374
43fead97 375#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 376
ffc61ed2 377#define OOB_UNICODE 12345678
93733859 378#define OOB_NAMEDCLASS -1
b8c5462f 379
a0ed51b3
LW
380#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
381#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
382
8615cb43 383
b45f050a
JF
384/* length of regex to show in messages that don't mark a position within */
385#define RegexLengthToShowInErrorMessages 127
386
387/*
388 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
389 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
390 * op/pragma/warn/regcomp.
391 */
7253e4e3
RK
392#define MARKER1 "<-- HERE" /* marker as it appears in the description */
393#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 394
7253e4e3 395#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
396
397/*
398 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
399 * arg. Show regex, up to a maximum length. If it's too long, chop and add
400 * "...".
401 */
58e23c8d 402#define _FAIL(code) STMT_START { \
bfed75c6 403 const char *ellipses = ""; \
ccb2c380
MP
404 IV len = RExC_end - RExC_precomp; \
405 \
406 if (!SIZE_ONLY) \
288b8c02 407 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
408 if (len > RegexLengthToShowInErrorMessages) { \
409 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
410 len = RegexLengthToShowInErrorMessages - 10; \
411 ellipses = "..."; \
412 } \
58e23c8d 413 code; \
ccb2c380 414} STMT_END
8615cb43 415
58e23c8d
YO
416#define FAIL(msg) _FAIL( \
417 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
418 msg, (int)len, RExC_precomp, ellipses))
419
420#define FAIL2(msg,arg) _FAIL( \
421 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
422 arg, (int)len, RExC_precomp, ellipses))
423
b45f050a 424/*
b45f050a
JF
425 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
426 */
ccb2c380 427#define Simple_vFAIL(m) STMT_START { \
a28509cc 428 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
429 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
430 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
431} STMT_END
b45f050a
JF
432
433/*
434 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
435 */
ccb2c380
MP
436#define vFAIL(m) STMT_START { \
437 if (!SIZE_ONLY) \
288b8c02 438 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
439 Simple_vFAIL(m); \
440} STMT_END
b45f050a
JF
441
442/*
443 * Like Simple_vFAIL(), but accepts two arguments.
444 */
ccb2c380 445#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 446 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
447 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
448 (int)offset, RExC_precomp, RExC_precomp + offset); \
449} STMT_END
b45f050a
JF
450
451/*
452 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
453 */
ccb2c380
MP
454#define vFAIL2(m,a1) STMT_START { \
455 if (!SIZE_ONLY) \
288b8c02 456 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
457 Simple_vFAIL2(m, a1); \
458} STMT_END
b45f050a
JF
459
460
461/*
462 * Like Simple_vFAIL(), but accepts three arguments.
463 */
ccb2c380 464#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 465 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
466 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
467 (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
b45f050a
JF
469
470/*
471 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
472 */
ccb2c380
MP
473#define vFAIL3(m,a1,a2) STMT_START { \
474 if (!SIZE_ONLY) \
288b8c02 475 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
476 Simple_vFAIL3(m, a1, a2); \
477} STMT_END
b45f050a
JF
478
479/*
480 * Like Simple_vFAIL(), but accepts four arguments.
481 */
ccb2c380 482#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 483 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
484 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
485 (int)offset, RExC_precomp, RExC_precomp + offset); \
486} STMT_END
b45f050a 487
668c081a 488#define ckWARNreg(loc,m) STMT_START { \
a28509cc 489 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
490 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
491 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
492} STMT_END
493
668c081a 494#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 495 const IV offset = loc - RExC_precomp; \
d1d15184 496 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
497 m REPORT_LOCATION, \
498 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
499} STMT_END
500
668c081a 501#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 502 const IV offset = loc - RExC_precomp; \
668c081a 503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
504 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
505} STMT_END
506
507#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 508 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
510 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
511} STMT_END
512
668c081a
NC
513#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
514 const IV offset = loc - RExC_precomp; \
515 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
516 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
517} STMT_END
518
ccb2c380 519#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 520 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
521 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
522 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
523} STMT_END
524
668c081a
NC
525#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
526 const IV offset = loc - RExC_precomp; \
527 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
528 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
529} STMT_END
530
ccb2c380 531#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 532 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
533 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
534 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
535} STMT_END
9d1d55b5 536
8615cb43 537
cd439c50 538/* Allow for side effects in s */
ccb2c380
MP
539#define REGC(c,s) STMT_START { \
540 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
541} STMT_END
cd439c50 542
fac92740
MJD
543/* Macros for recording node offsets. 20001227 mjd@plover.com
544 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
545 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
546 * Element 0 holds the number n.
07be1b83 547 * Position is 1 indexed.
fac92740 548 */
7122b237
YO
549#ifndef RE_TRACK_PATTERN_OFFSETS
550#define Set_Node_Offset_To_R(node,byte)
551#define Set_Node_Offset(node,byte)
552#define Set_Cur_Node_Offset
553#define Set_Node_Length_To_R(node,len)
554#define Set_Node_Length(node,len)
555#define Set_Node_Cur_Length(node)
556#define Node_Offset(n)
557#define Node_Length(n)
558#define Set_Node_Offset_Length(node,offset,len)
559#define ProgLen(ri) ri->u.proglen
560#define SetProgLen(ri,x) ri->u.proglen = x
561#else
562#define ProgLen(ri) ri->u.offsets[0]
563#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
564#define Set_Node_Offset_To_R(node,byte) STMT_START { \
565 if (! SIZE_ONLY) { \
566 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 567 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 568 if((node) < 0) { \
551405c4 569 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
570 } else { \
571 RExC_offsets[2*(node)-1] = (byte); \
572 } \
573 } \
574} STMT_END
575
576#define Set_Node_Offset(node,byte) \
577 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
578#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
579
580#define Set_Node_Length_To_R(node,len) STMT_START { \
581 if (! SIZE_ONLY) { \
582 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 583 __LINE__, (int)(node), (int)(len))); \
ccb2c380 584 if((node) < 0) { \
551405c4 585 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
586 } else { \
587 RExC_offsets[2*(node)] = (len); \
588 } \
589 } \
590} STMT_END
591
592#define Set_Node_Length(node,len) \
593 Set_Node_Length_To_R((node)-RExC_emit_start, len)
594#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
595#define Set_Node_Cur_Length(node) \
596 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
597
598/* Get offsets and lengths */
599#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
600#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
601
07be1b83
YO
602#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
603 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
604 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
605} STMT_END
7122b237 606#endif
07be1b83
YO
607
608#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
609#define EXPERIMENTAL_INPLACESCAN
f427392e 610#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 611
304ee84b
YO
612#define DEBUG_STUDYDATA(str,data,depth) \
613DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 614 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
615 "%*s" str "Pos:%"IVdf"/%"IVdf \
616 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
617 (int)(depth)*2, "", \
618 (IV)((data)->pos_min), \
619 (IV)((data)->pos_delta), \
304ee84b 620 (UV)((data)->flags), \
1de06328 621 (IV)((data)->whilem_c), \
304ee84b
YO
622 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
623 is_inf ? "INF " : "" \
1de06328
YO
624 ); \
625 if ((data)->last_found) \
626 PerlIO_printf(Perl_debug_log, \
627 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
628 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
629 SvPVX_const((data)->last_found), \
630 (IV)((data)->last_end), \
631 (IV)((data)->last_start_min), \
632 (IV)((data)->last_start_max), \
633 ((data)->longest && \
634 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
635 SvPVX_const((data)->longest_fixed), \
636 (IV)((data)->offset_fixed), \
637 ((data)->longest && \
638 (data)->longest==&((data)->longest_float)) ? "*" : "", \
639 SvPVX_const((data)->longest_float), \
640 (IV)((data)->offset_float_min), \
641 (IV)((data)->offset_float_max) \
642 ); \
643 PerlIO_printf(Perl_debug_log,"\n"); \
644});
645
acfe0abc 646static void clear_re(pTHX_ void *r);
4327152a 647
653099ff 648/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 649 Update the longest found anchored substring and the longest found
653099ff
GS
650 floating substrings if needed. */
651
4327152a 652STATIC void
304ee84b 653S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 654{
e1ec3a88
AL
655 const STRLEN l = CHR_SVLEN(data->last_found);
656 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 657 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 658
7918f24d
NC
659 PERL_ARGS_ASSERT_SCAN_COMMIT;
660
c277df42 661 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 662 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
663 if (*data->longest == data->longest_fixed) {
664 data->offset_fixed = l ? data->last_start_min : data->pos_min;
665 if (data->flags & SF_BEFORE_EOL)
b81d288d 666 data->flags
c277df42
IZ
667 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
668 else
669 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
670 data->minlen_fixed=minlenp;
671 data->lookbehind_fixed=0;
a0ed51b3 672 }
304ee84b 673 else { /* *data->longest == data->longest_float */
c277df42 674 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
675 data->offset_float_max = (l
676 ? data->last_start_max
c277df42 677 : data->pos_min + data->pos_delta);
304ee84b 678 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 679 data->offset_float_max = I32_MAX;
c277df42 680 if (data->flags & SF_BEFORE_EOL)
b81d288d 681 data->flags
c277df42
IZ
682 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
683 else
684 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
685 data->minlen_float=minlenp;
686 data->lookbehind_float=0;
c277df42
IZ
687 }
688 }
689 SvCUR_set(data->last_found, 0);
0eda9292 690 {
a28509cc 691 SV * const sv = data->last_found;
097eb12c
AL
692 if (SvUTF8(sv) && SvMAGICAL(sv)) {
693 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
694 if (mg)
695 mg->mg_len = 0;
696 }
0eda9292 697 }
c277df42
IZ
698 data->last_end = -1;
699 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 700 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
701}
702
653099ff
GS
703/* Can match anything (initialization) */
704STATIC void
097eb12c 705S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 706{
7918f24d
NC
707 PERL_ARGS_ASSERT_CL_ANYTHING;
708
653099ff 709 ANYOF_CLASS_ZERO(cl);
f8bef550 710 ANYOF_BITMAP_SETALL(cl);
11454c59 711 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
653099ff
GS
712 if (LOC)
713 cl->flags |= ANYOF_LOCALE;
714}
715
716/* Can match anything (initialization) */
717STATIC int
5f66b61c 718S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
719{
720 int value;
721
7918f24d
NC
722 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
723
aaa51d5e 724 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
725 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
726 return 1;
1aa99e6b
IH
727 if (!(cl->flags & ANYOF_UNICODE_ALL))
728 return 0;
10edeb5d 729 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 730 return 0;
653099ff
GS
731 return 1;
732}
733
734/* Can match anything (initialization) */
735STATIC void
097eb12c 736S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 737{
7918f24d
NC
738 PERL_ARGS_ASSERT_CL_INIT;
739
8ecf7187 740 Zero(cl, 1, struct regnode_charclass_class);
653099ff 741 cl->type = ANYOF;
830247a4 742 cl_anything(pRExC_state, cl);
653099ff
GS
743}
744
745STATIC void
097eb12c 746S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 747{
7918f24d
NC
748 PERL_ARGS_ASSERT_CL_INIT_ZERO;
749
8ecf7187 750 Zero(cl, 1, struct regnode_charclass_class);
653099ff 751 cl->type = ANYOF;
830247a4 752 cl_anything(pRExC_state, cl);
653099ff
GS
753 if (LOC)
754 cl->flags |= ANYOF_LOCALE;
755}
756
757/* 'And' a given class with another one. Can create false positives */
758/* We assume that cl is not inverted */
759STATIC void
5f66b61c 760S_cl_and(struct regnode_charclass_class *cl,
a28509cc 761 const struct regnode_charclass_class *and_with)
653099ff 762{
7918f24d 763 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
764
765 assert(and_with->type == ANYOF);
1e6ade67
KW
766
767 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
768 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 769 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
770 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
771 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
772 int i;
773
774 if (and_with->flags & ANYOF_INVERT)
775 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
776 cl->bitmap[i] &= ~and_with->bitmap[i];
777 else
778 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
779 cl->bitmap[i] &= and_with->bitmap[i];
780 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
781 if (!(and_with->flags & ANYOF_EOS))
782 cl->flags &= ~ANYOF_EOS;
1aa99e6b 783
39065660
KW
784 if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
785 cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
11454c59
KW
786 if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
787 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
8951c461 788
3ff7ceb3 789 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
14ebb1a2 790 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b 791 cl->flags &= ~ANYOF_UNICODE_ALL;
ef87b810
KW
792 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
793 only the one(s)
794 actually set */
1aa99e6b
IH
795 ARG_SET(cl, ARG(and_with));
796 }
14ebb1a2
JH
797 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
798 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 799 cl->flags &= ~ANYOF_UNICODE_ALL;
3ff7ceb3 800 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
14ebb1a2 801 !(and_with->flags & ANYOF_INVERT))
3ff7ceb3 802 cl->flags &= ~ANYOF_NONBITMAP;
653099ff
GS
803}
804
805/* 'OR' a given class with another one. Can create false positives */
806/* We assume that cl is not inverted */
807STATIC void
097eb12c 808S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 809{
7918f24d
NC
810 PERL_ARGS_ASSERT_CL_OR;
811
653099ff
GS
812 if (or_with->flags & ANYOF_INVERT) {
813 /* We do not use
814 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
815 * <= (B1 | !B2) | (CL1 | !CL2)
816 * which is wasteful if CL2 is small, but we ignore CL2:
817 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
818 * XXXX Can we handle case-fold? Unclear:
819 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
820 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
821 */
822 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
823 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
824 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
825 int i;
826
827 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
828 cl->bitmap[i] |= ~or_with->bitmap[i];
829 } /* XXXX: logic is complicated otherwise */
830 else {
830247a4 831 cl_anything(pRExC_state, cl);
653099ff
GS
832 }
833 } else {
834 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
835 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
836 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
837 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
838 int i;
839
840 /* OR char bitmap and class bitmap separately */
841 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
842 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 843 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
844 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
845 cl->classflags[i] |= or_with->classflags[i];
846 cl->flags |= ANYOF_CLASS;
847 }
848 }
849 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 850 cl_anything(pRExC_state, cl);
653099ff
GS
851 }
852 }
853 if (or_with->flags & ANYOF_EOS)
854 cl->flags |= ANYOF_EOS;
11454c59
KW
855 if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
856 cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
1aa99e6b 857
39065660
KW
858 if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
859 cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
8951c461 860
9826f543
KW
861 /* If both nodes match something outside the bitmap, but what they match
862 * outside is not the same pointer, and hence not easily compared, give up
863 * and allow the start class to match everything outside the bitmap */
3ff7ceb3 864 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
1aa99e6b
IH
865 ARG(cl) != ARG(or_with)) {
866 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 867 }
9826f543 868
1aa99e6b
IH
869 if (or_with->flags & ANYOF_UNICODE_ALL) {
870 cl->flags |= ANYOF_UNICODE_ALL;
1aa99e6b 871 }
653099ff
GS
872}
873
a3621e74
YO
874#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
875#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
876#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
877#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
878
3dab1dad
YO
879
880#ifdef DEBUGGING
07be1b83 881/*
2b8b4781
NC
882 dump_trie(trie,widecharmap,revcharmap)
883 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
884 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
885
886 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
887 The _interim_ variants are used for debugging the interim
888 tables that are used to generate the final compressed
889 representation which is what dump_trie expects.
890
486ec47a 891 Part of the reason for their existence is to provide a form
3dab1dad 892 of documentation as to how the different representations function.
07be1b83
YO
893
894*/
3dab1dad
YO
895
896/*
3dab1dad
YO
897 Dumps the final compressed table form of the trie to Perl_debug_log.
898 Used for debugging make_trie().
899*/
b9a59e08 900
3dab1dad 901STATIC void
2b8b4781
NC
902S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
903 AV *revcharmap, U32 depth)
3dab1dad
YO
904{
905 U32 state;
ab3bbdeb 906 SV *sv=sv_newmortal();
55eed653 907 int colwidth= widecharmap ? 6 : 4;
2e64971a 908 U16 word;
3dab1dad
YO
909 GET_RE_DEBUG_FLAGS_DECL;
910
7918f24d 911 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 912
3dab1dad
YO
913 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
914 (int)depth * 2 + 2,"",
915 "Match","Base","Ofs" );
916
917 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 918 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 919 if ( tmp ) {
ab3bbdeb
YO
920 PerlIO_printf( Perl_debug_log, "%*s",
921 colwidth,
ddc5bc0f 922 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
923 PL_colors[0], PL_colors[1],
924 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
925 PERL_PV_ESCAPE_FIRSTCHAR
926 )
927 );
3dab1dad
YO
928 }
929 }
930 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
931 (int)depth * 2 + 2,"");
932
933 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 934 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
935 PerlIO_printf( Perl_debug_log, "\n");
936
1e2e3d02 937 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 938 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
939
940 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
941
942 if ( trie->states[ state ].wordnum ) {
943 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
944 } else {
945 PerlIO_printf( Perl_debug_log, "%6s", "" );
946 }
947
948 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
949
950 if ( base ) {
951 U32 ofs = 0;
952
953 while( ( base + ofs < trie->uniquecharcount ) ||
954 ( base + ofs - trie->uniquecharcount < trie->lasttrans
955 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
956 ofs++;
957
958 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
959
960 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
961 if ( ( base + ofs >= trie->uniquecharcount ) &&
962 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
963 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
964 {
ab3bbdeb
YO
965 PerlIO_printf( Perl_debug_log, "%*"UVXf,
966 colwidth,
3dab1dad
YO
967 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
968 } else {
ab3bbdeb 969 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
970 }
971 }
972
973 PerlIO_printf( Perl_debug_log, "]");
974
975 }
976 PerlIO_printf( Perl_debug_log, "\n" );
977 }
2e64971a
DM
978 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
979 for (word=1; word <= trie->wordcount; word++) {
980 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
981 (int)word, (int)(trie->wordinfo[word].prev),
982 (int)(trie->wordinfo[word].len));
983 }
984 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
985}
986/*
3dab1dad
YO
987 Dumps a fully constructed but uncompressed trie in list form.
988 List tries normally only are used for construction when the number of
989 possible chars (trie->uniquecharcount) is very high.
990 Used for debugging make_trie().
991*/
992STATIC void
55eed653 993S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
994 HV *widecharmap, AV *revcharmap, U32 next_alloc,
995 U32 depth)
3dab1dad
YO
996{
997 U32 state;
ab3bbdeb 998 SV *sv=sv_newmortal();
55eed653 999 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1000 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1001
1002 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1003
3dab1dad 1004 /* print out the table precompression. */
ab3bbdeb
YO
1005 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1006 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1007 "------:-----+-----------------\n" );
3dab1dad
YO
1008
1009 for( state=1 ; state < next_alloc ; state ++ ) {
1010 U16 charid;
1011
ab3bbdeb 1012 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1013 (int)depth * 2 + 2,"", (UV)state );
1014 if ( ! trie->states[ state ].wordnum ) {
1015 PerlIO_printf( Perl_debug_log, "%5s| ","");
1016 } else {
1017 PerlIO_printf( Perl_debug_log, "W%4x| ",
1018 trie->states[ state ].wordnum
1019 );
1020 }
1021 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1022 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1023 if ( tmp ) {
1024 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1025 colwidth,
ddc5bc0f 1026 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1027 PL_colors[0], PL_colors[1],
1028 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1029 PERL_PV_ESCAPE_FIRSTCHAR
1030 ) ,
1e2e3d02
YO
1031 TRIE_LIST_ITEM(state,charid).forid,
1032 (UV)TRIE_LIST_ITEM(state,charid).newstate
1033 );
1034 if (!(charid % 10))
664e119d
RGS
1035 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1036 (int)((depth * 2) + 14), "");
1e2e3d02 1037 }
ab3bbdeb
YO
1038 }
1039 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1040 }
1041}
1042
1043/*
3dab1dad
YO
1044 Dumps a fully constructed but uncompressed trie in table form.
1045 This is the normal DFA style state transition table, with a few
1046 twists to facilitate compression later.
1047 Used for debugging make_trie().
1048*/
1049STATIC void
55eed653 1050S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1051 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1052 U32 depth)
3dab1dad
YO
1053{
1054 U32 state;
1055 U16 charid;
ab3bbdeb 1056 SV *sv=sv_newmortal();
55eed653 1057 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1058 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1059
1060 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1061
1062 /*
1063 print out the table precompression so that we can do a visual check
1064 that they are identical.
1065 */
1066
1067 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1068
1069 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1070 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1071 if ( tmp ) {
ab3bbdeb
YO
1072 PerlIO_printf( Perl_debug_log, "%*s",
1073 colwidth,
ddc5bc0f 1074 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1075 PL_colors[0], PL_colors[1],
1076 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1077 PERL_PV_ESCAPE_FIRSTCHAR
1078 )
1079 );
3dab1dad
YO
1080 }
1081 }
1082
1083 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1084
1085 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1086 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1087 }
1088
1089 PerlIO_printf( Perl_debug_log, "\n" );
1090
1091 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1092
1093 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1094 (int)depth * 2 + 2,"",
1095 (UV)TRIE_NODENUM( state ) );
1096
1097 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1098 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1099 if (v)
1100 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1101 else
1102 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1103 }
1104 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1105 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1106 } else {
1107 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1108 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1109 }
1110 }
07be1b83 1111}
3dab1dad
YO
1112
1113#endif
1114
2e64971a 1115
786e8c11
YO
1116/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1117 startbranch: the first branch in the whole branch sequence
1118 first : start branch of sequence of branch-exact nodes.
1119 May be the same as startbranch
1120 last : Thing following the last branch.
1121 May be the same as tail.
1122 tail : item following the branch sequence
1123 count : words in the sequence
1124 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1125 depth : indent depth
3dab1dad 1126
786e8c11 1127Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1128
786e8c11
YO
1129A trie is an N'ary tree where the branches are determined by digital
1130decomposition of the key. IE, at the root node you look up the 1st character and
1131follow that branch repeat until you find the end of the branches. Nodes can be
1132marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1133
786e8c11 1134 /he|she|his|hers/
72f13be8 1135
786e8c11
YO
1136would convert into the following structure. Numbers represent states, letters
1137following numbers represent valid transitions on the letter from that state, if
1138the number is in square brackets it represents an accepting state, otherwise it
1139will be in parenthesis.
07be1b83 1140
786e8c11
YO
1141 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1142 | |
1143 | (2)
1144 | |
1145 (1) +-i->(6)-+-s->[7]
1146 |
1147 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1148
786e8c11
YO
1149 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1150
1151This shows that when matching against the string 'hers' we will begin at state 1
1152read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1153then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1154is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1155single traverse. We store a mapping from accepting to state to which word was
1156matched, and then when we have multiple possibilities we try to complete the
1157rest of the regex in the order in which they occured in the alternation.
1158
1159The only prior NFA like behaviour that would be changed by the TRIE support is
1160the silent ignoring of duplicate alternations which are of the form:
1161
1162 / (DUPE|DUPE) X? (?{ ... }) Y /x
1163
4b714af6 1164Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1165and without the optimisation. With the optimisations dupes will be silently
486ec47a 1166ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1167the following demonstrates:
1168
1169 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1170
1171which prints out 'word' three times, but
1172
1173 'words'=~/(word|word|word)(?{ print $1 })S/
1174
1175which doesnt print it out at all. This is due to other optimisations kicking in.
1176
1177Example of what happens on a structural level:
1178
486ec47a 1179The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1180
1181 1: CURLYM[1] {1,32767}(18)
1182 5: BRANCH(8)
1183 6: EXACT <ac>(16)
1184 8: BRANCH(11)
1185 9: EXACT <ad>(16)
1186 11: BRANCH(14)
1187 12: EXACT <ab>(16)
1188 16: SUCCEED(0)
1189 17: NOTHING(18)
1190 18: END(0)
1191
1192This would be optimizable with startbranch=5, first=5, last=16, tail=16
1193and should turn into:
1194
1195 1: CURLYM[1] {1,32767}(18)
1196 5: TRIE(16)
1197 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1198 <ac>
1199 <ad>
1200 <ab>
1201 16: SUCCEED(0)
1202 17: NOTHING(18)
1203 18: END(0)
1204
1205Cases where tail != last would be like /(?foo|bar)baz/:
1206
1207 1: BRANCH(4)
1208 2: EXACT <foo>(8)
1209 4: BRANCH(7)
1210 5: EXACT <bar>(8)
1211 7: TAIL(8)
1212 8: EXACT <baz>(10)
1213 10: END(0)
1214
1215which would be optimizable with startbranch=1, first=1, last=7, tail=8
1216and would end up looking like:
1217
1218 1: TRIE(8)
1219 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1220 <foo>
1221 <bar>
1222 7: TAIL(8)
1223 8: EXACT <baz>(10)
1224 10: END(0)
1225
1226 d = uvuni_to_utf8_flags(d, uv, 0);
1227
1228is the recommended Unicode-aware way of saying
1229
1230 *(d++) = uv;
1231*/
1232
1e2e3d02 1233#define TRIE_STORE_REVCHAR \
786e8c11 1234 STMT_START { \
73031816
NC
1235 if (UTF) { \
1236 SV *zlopp = newSV(2); \
88c9ea1e
CB
1237 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1238 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1239 SvCUR_set(zlopp, kapow - flrbbbbb); \
1240 SvPOK_on(zlopp); \
1241 SvUTF8_on(zlopp); \
1242 av_push(revcharmap, zlopp); \
1243 } else { \
6bdeddd2 1244 char ooooff = (char)uvc; \
73031816
NC
1245 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1246 } \
1247 } STMT_END
786e8c11
YO
1248
1249#define TRIE_READ_CHAR STMT_START { \
1250 wordlen++; \
1251 if ( UTF ) { \
1252 if ( folder ) { \
1253 if ( foldlen > 0 ) { \
1254 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1255 foldlen -= len; \
1256 scan += len; \
1257 len = 0; \
1258 } else { \
1259 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1260 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1261 foldlen -= UNISKIP( uvc ); \
1262 scan = foldbuf + UNISKIP( uvc ); \
1263 } \
1264 } else { \
1265 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1266 } \
1267 } else { \
1268 uvc = (U32)*uc; \
1269 len = 1; \
1270 } \
1271} STMT_END
1272
1273
1274
1275#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1276 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1277 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1278 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1279 } \
1280 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1281 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1282 TRIE_LIST_CUR( state )++; \
1283} STMT_END
07be1b83 1284
786e8c11
YO
1285#define TRIE_LIST_NEW(state) STMT_START { \
1286 Newxz( trie->states[ state ].trans.list, \
1287 4, reg_trie_trans_le ); \
1288 TRIE_LIST_CUR( state ) = 1; \
1289 TRIE_LIST_LEN( state ) = 4; \
1290} STMT_END
07be1b83 1291
786e8c11
YO
1292#define TRIE_HANDLE_WORD(state) STMT_START { \
1293 U16 dupe= trie->states[ state ].wordnum; \
1294 regnode * const noper_next = regnext( noper ); \
1295 \
786e8c11
YO
1296 DEBUG_r({ \
1297 /* store the word for dumping */ \
1298 SV* tmp; \
1299 if (OP(noper) != NOTHING) \
740cce10 1300 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1301 else \
740cce10 1302 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1303 av_push( trie_words, tmp ); \
786e8c11
YO
1304 }); \
1305 \
1306 curword++; \
2e64971a
DM
1307 trie->wordinfo[curword].prev = 0; \
1308 trie->wordinfo[curword].len = wordlen; \
1309 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1310 \
1311 if ( noper_next < tail ) { \
1312 if (!trie->jump) \
c944940b 1313 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1314 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1315 if (!jumper) \
1316 jumper = noper_next; \
1317 if (!nextbranch) \
1318 nextbranch= regnext(cur); \
1319 } \
1320 \
1321 if ( dupe ) { \
2e64971a
DM
1322 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1323 /* chain, so that when the bits of chain are later */\
1324 /* linked together, the dups appear in the chain */\
1325 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1326 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1327 } else { \
1328 /* we haven't inserted this word yet. */ \
1329 trie->states[ state ].wordnum = curword; \
1330 } \
1331} STMT_END
07be1b83 1332
3dab1dad 1333
786e8c11
YO
1334#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1335 ( ( base + charid >= ucharcount \
1336 && base + charid < ubound \
1337 && state == trie->trans[ base - ucharcount + charid ].check \
1338 && trie->trans[ base - ucharcount + charid ].next ) \
1339 ? trie->trans[ base - ucharcount + charid ].next \
1340 : ( state==1 ? special : 0 ) \
1341 )
3dab1dad 1342
786e8c11
YO
1343#define MADE_TRIE 1
1344#define MADE_JUMP_TRIE 2
1345#define MADE_EXACT_TRIE 4
3dab1dad 1346
a3621e74 1347STATIC I32
786e8c11 1348S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1349{
27da23d5 1350 dVAR;
a3621e74
YO
1351 /* first pass, loop through and scan words */
1352 reg_trie_data *trie;
55eed653 1353 HV *widecharmap = NULL;
2b8b4781 1354 AV *revcharmap = newAV();
a3621e74 1355 regnode *cur;
9f7f3913 1356 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1357 STRLEN len = 0;
1358 UV uvc = 0;
1359 U16 curword = 0;
1360 U32 next_alloc = 0;
786e8c11
YO
1361 regnode *jumper = NULL;
1362 regnode *nextbranch = NULL;
7f69552c 1363 regnode *convert = NULL;
2e64971a 1364 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1365 /* we just use folder as a flag in utf8 */
1e696034 1366 const U8 * folder = NULL;
a3621e74 1367
2b8b4781
NC
1368#ifdef DEBUGGING
1369 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1370 AV *trie_words = NULL;
1371 /* along with revcharmap, this only used during construction but both are
1372 * useful during debugging so we store them in the struct when debugging.
8e11feef 1373 */
2b8b4781
NC
1374#else
1375 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1376 STRLEN trie_charcount=0;
3dab1dad 1377#endif
2b8b4781 1378 SV *re_trie_maxbuff;
a3621e74 1379 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1380
1381 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1382#ifndef DEBUGGING
1383 PERL_UNUSED_ARG(depth);
1384#endif
a3621e74 1385
1e696034
KW
1386 switch (flags) {
1387 case EXACTFU: folder = PL_fold_latin1; break;
1388 case EXACTF: folder = PL_fold; break;
1389 case EXACTFL: folder = PL_fold_locale; break;
1390 }
1391
c944940b 1392 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1393 trie->refcount = 1;
3dab1dad 1394 trie->startstate = 1;
786e8c11 1395 trie->wordcount = word_count;
f8fc2ecf 1396 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1397 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1398 if (!(UTF && folder))
c944940b 1399 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1400 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1401 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1402
a3621e74 1403 DEBUG_r({
2b8b4781 1404 trie_words = newAV();
a3621e74 1405 });
a3621e74 1406
0111c4fd 1407 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1408 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1409 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1410 }
3dab1dad
YO
1411 DEBUG_OPTIMISE_r({
1412 PerlIO_printf( Perl_debug_log,
786e8c11 1413 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1414 (int)depth * 2 + 2, "",
1415 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1416 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1417 (int)depth);
3dab1dad 1418 });
7f69552c
YO
1419
1420 /* Find the node we are going to overwrite */
1421 if ( first == startbranch && OP( last ) != BRANCH ) {
1422 /* whole branch chain */
1423 convert = first;
1424 } else {
1425 /* branch sub-chain */
1426 convert = NEXTOPER( first );
1427 }
1428
a3621e74
YO
1429 /* -- First loop and Setup --
1430
1431 We first traverse the branches and scan each word to determine if it
1432 contains widechars, and how many unique chars there are, this is
1433 important as we have to build a table with at least as many columns as we
1434 have unique chars.
1435
1436 We use an array of integers to represent the character codes 0..255
38a44b82 1437 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1438 native representation of the character value as the key and IV's for the
1439 coded index.
1440
1441 *TODO* If we keep track of how many times each character is used we can
1442 remap the columns so that the table compression later on is more
3b753521 1443 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1444 middle and the least common are on the outside. IMO this would be better
1445 than a most to least common mapping as theres a decent chance the most
1446 common letter will share a node with the least common, meaning the node
486ec47a 1447 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1448 case is when we have the least common nodes twice.
1449
1450 */
1451
a3621e74 1452 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1453 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1454 const U8 *uc = (U8*)STRING( noper );
a28509cc 1455 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1456 STRLEN foldlen = 0;
1457 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1458 const U8 *scan = (U8*)NULL;
07be1b83 1459 U32 wordlen = 0; /* required init */
02daf0ab
YO
1460 STRLEN chars = 0;
1461 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1462
3dab1dad
YO
1463 if (OP(noper) == NOTHING) {
1464 trie->minlen= 0;
1465 continue;
1466 }
02daf0ab
YO
1467 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1468 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1469 regardless of encoding */
1470
a3621e74 1471 for ( ; uc < e ; uc += len ) {
3dab1dad 1472 TRIE_CHARCOUNT(trie)++;
a3621e74 1473 TRIE_READ_CHAR;
3dab1dad 1474 chars++;
a3621e74
YO
1475 if ( uvc < 256 ) {
1476 if ( !trie->charmap[ uvc ] ) {
1477 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1478 if ( folder )
1479 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1480 TRIE_STORE_REVCHAR;
a3621e74 1481 }
02daf0ab 1482 if ( set_bit ) {
62012aee
KW
1483 /* store the codepoint in the bitmap, and its folded
1484 * equivalent. */
02daf0ab 1485 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1486
1487 /* store the folded codepoint */
1488 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1489
1490 if ( !UTF ) {
1491 /* store first byte of utf8 representation of
acdf4139
KW
1492 variant codepoints */
1493 if (! UNI_IS_INVARIANT(uvc)) {
1494 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1495 }
1496 }
02daf0ab
YO
1497 set_bit = 0; /* We've done our bit :-) */
1498 }
a3621e74
YO
1499 } else {
1500 SV** svpp;
55eed653
NC
1501 if ( !widecharmap )
1502 widecharmap = newHV();
a3621e74 1503
55eed653 1504 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1505
1506 if ( !svpp )
e4584336 1507 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1508
1509 if ( !SvTRUE( *svpp ) ) {
1510 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1511 TRIE_STORE_REVCHAR;
a3621e74
YO
1512 }
1513 }
1514 }
3dab1dad
YO
1515 if( cur == first ) {
1516 trie->minlen=chars;
1517 trie->maxlen=chars;
1518 } else if (chars < trie->minlen) {
1519 trie->minlen=chars;
1520 } else if (chars > trie->maxlen) {
1521 trie->maxlen=chars;
1522 }
1523
a3621e74
YO
1524 } /* end first pass */
1525 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1526 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1527 (int)depth * 2 + 2,"",
55eed653 1528 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1529 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1530 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1531 );
a3621e74
YO
1532
1533 /*
1534 We now know what we are dealing with in terms of unique chars and
1535 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1536 representation using a flat table will take. If it's over a reasonable
1537 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1538 conservative but potentially much slower representation using an array
1539 of lists.
1540
1541 At the end we convert both representations into the same compressed
1542 form that will be used in regexec.c for matching with. The latter
1543 is a form that cannot be used to construct with but has memory
1544 properties similar to the list form and access properties similar
1545 to the table form making it both suitable for fast searches and
1546 small enough that its feasable to store for the duration of a program.
1547
1548 See the comment in the code where the compressed table is produced
1549 inplace from the flat tabe representation for an explanation of how
1550 the compression works.
1551
1552 */
1553
1554
2e64971a
DM
1555 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1556 prev_states[1] = 0;
1557
3dab1dad 1558 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1559 /*
1560 Second Pass -- Array Of Lists Representation
1561
1562 Each state will be represented by a list of charid:state records
1563 (reg_trie_trans_le) the first such element holds the CUR and LEN
1564 points of the allocated array. (See defines above).
1565
1566 We build the initial structure using the lists, and then convert
1567 it into the compressed table form which allows faster lookups
1568 (but cant be modified once converted).
a3621e74
YO
1569 */
1570
a3621e74
YO
1571 STRLEN transcount = 1;
1572
1e2e3d02
YO
1573 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1574 "%*sCompiling trie using list compiler\n",
1575 (int)depth * 2 + 2, ""));
446bd890 1576
c944940b
JH
1577 trie->states = (reg_trie_state *)
1578 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1579 sizeof(reg_trie_state) );
a3621e74
YO
1580 TRIE_LIST_NEW(1);
1581 next_alloc = 2;
1582
1583 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1584
c445ea15
AL
1585 regnode * const noper = NEXTOPER( cur );
1586 U8 *uc = (U8*)STRING( noper );
1587 const U8 * const e = uc + STR_LEN( noper );
1588 U32 state = 1; /* required init */
1589 U16 charid = 0; /* sanity init */
1590 U8 *scan = (U8*)NULL; /* sanity init */
1591 STRLEN foldlen = 0; /* required init */
07be1b83 1592 U32 wordlen = 0; /* required init */
c445ea15
AL
1593 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1594
3dab1dad 1595 if (OP(noper) != NOTHING) {
786e8c11 1596 for ( ; uc < e ; uc += len ) {
c445ea15 1597
786e8c11 1598 TRIE_READ_CHAR;
c445ea15 1599
786e8c11
YO
1600 if ( uvc < 256 ) {
1601 charid = trie->charmap[ uvc ];
c445ea15 1602 } else {
55eed653 1603 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1604 if ( !svpp ) {
1605 charid = 0;
1606 } else {
1607 charid=(U16)SvIV( *svpp );
1608 }
c445ea15 1609 }
786e8c11
YO
1610 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1611 if ( charid ) {
a3621e74 1612
786e8c11
YO
1613 U16 check;
1614 U32 newstate = 0;
a3621e74 1615
786e8c11
YO
1616 charid--;
1617 if ( !trie->states[ state ].trans.list ) {
1618 TRIE_LIST_NEW( state );
c445ea15 1619 }
786e8c11
YO
1620 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1621 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1622 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1623 break;
1624 }
1625 }
1626 if ( ! newstate ) {
1627 newstate = next_alloc++;
2e64971a 1628 prev_states[newstate] = state;
786e8c11
YO
1629 TRIE_LIST_PUSH( state, charid, newstate );
1630 transcount++;
1631 }
1632 state = newstate;
1633 } else {
1634 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1635 }
a28509cc 1636 }
c445ea15 1637 }
3dab1dad 1638 TRIE_HANDLE_WORD(state);
a3621e74
YO
1639
1640 } /* end second pass */
1641
1e2e3d02
YO
1642 /* next alloc is the NEXT state to be allocated */
1643 trie->statecount = next_alloc;
c944940b
JH
1644 trie->states = (reg_trie_state *)
1645 PerlMemShared_realloc( trie->states,
1646 next_alloc
1647 * sizeof(reg_trie_state) );
a3621e74 1648
3dab1dad 1649 /* and now dump it out before we compress it */
2b8b4781
NC
1650 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1651 revcharmap, next_alloc,
1652 depth+1)
1e2e3d02 1653 );
a3621e74 1654
c944940b
JH
1655 trie->trans = (reg_trie_trans *)
1656 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1657 {
1658 U32 state;
a3621e74
YO
1659 U32 tp = 0;
1660 U32 zp = 0;
1661
1662
1663 for( state=1 ; state < next_alloc ; state ++ ) {
1664 U32 base=0;
1665
1666 /*
1667 DEBUG_TRIE_COMPILE_MORE_r(
1668 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1669 );
1670 */
1671
1672 if (trie->states[state].trans.list) {
1673 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1674 U16 maxid=minid;
a28509cc 1675 U16 idx;
a3621e74
YO
1676
1677 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1678 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1679 if ( forid < minid ) {
1680 minid=forid;
1681 } else if ( forid > maxid ) {
1682 maxid=forid;
1683 }
a3621e74
YO
1684 }
1685 if ( transcount < tp + maxid - minid + 1) {
1686 transcount *= 2;
c944940b
JH
1687 trie->trans = (reg_trie_trans *)
1688 PerlMemShared_realloc( trie->trans,
446bd890
NC
1689 transcount
1690 * sizeof(reg_trie_trans) );
a3621e74
YO
1691 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1692 }
1693 base = trie->uniquecharcount + tp - minid;
1694 if ( maxid == minid ) {
1695 U32 set = 0;
1696 for ( ; zp < tp ; zp++ ) {
1697 if ( ! trie->trans[ zp ].next ) {
1698 base = trie->uniquecharcount + zp - minid;
1699 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1700 trie->trans[ zp ].check = state;
1701 set = 1;
1702 break;
1703 }
1704 }
1705 if ( !set ) {
1706 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1707 trie->trans[ tp ].check = state;
1708 tp++;
1709 zp = tp;
1710 }
1711 } else {
1712 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1713 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1714 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1715 trie->trans[ tid ].check = state;
1716 }
1717 tp += ( maxid - minid + 1 );
1718 }
1719 Safefree(trie->states[ state ].trans.list);
1720 }
1721 /*
1722 DEBUG_TRIE_COMPILE_MORE_r(
1723 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1724 );
1725 */
1726 trie->states[ state ].trans.base=base;
1727 }
cc601c31 1728 trie->lasttrans = tp + 1;
a3621e74
YO
1729 }
1730 } else {
1731 /*
1732 Second Pass -- Flat Table Representation.
1733
1734 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1735 We know that we will need Charcount+1 trans at most to store the data
1736 (one row per char at worst case) So we preallocate both structures
1737 assuming worst case.
1738
1739 We then construct the trie using only the .next slots of the entry
1740 structs.
1741
3b753521 1742 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1743 make compression both faster and easier by keeping track of how many non
1744 zero fields are in the node.
1745
1746 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1747 transition.
1748
1749 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1750 number representing the first entry of the node, and state as a
1751 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1752 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1753 are 2 entrys per node. eg:
1754
1755 A B A B
1756 1. 2 4 1. 3 7
1757 2. 0 3 3. 0 5
1758 3. 0 0 5. 0 0
1759 4. 0 0 7. 0 0
1760
1761 The table is internally in the right hand, idx form. However as we also
1762 have to deal with the states array which is indexed by nodenum we have to
1763 use TRIE_NODENUM() to convert.
1764
1765 */
1e2e3d02
YO
1766 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1767 "%*sCompiling trie using table compiler\n",
1768 (int)depth * 2 + 2, ""));
3dab1dad 1769
c944940b
JH
1770 trie->trans = (reg_trie_trans *)
1771 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1772 * trie->uniquecharcount + 1,
1773 sizeof(reg_trie_trans) );
1774 trie->states = (reg_trie_state *)
1775 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1776 sizeof(reg_trie_state) );
a3621e74
YO
1777 next_alloc = trie->uniquecharcount + 1;
1778
3dab1dad 1779
a3621e74
YO
1780 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1781
c445ea15 1782 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1783 const U8 *uc = (U8*)STRING( noper );
1784 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1785
1786 U32 state = 1; /* required init */
1787
1788 U16 charid = 0; /* sanity init */
1789 U32 accept_state = 0; /* sanity init */
1790 U8 *scan = (U8*)NULL; /* sanity init */
1791
1792 STRLEN foldlen = 0; /* required init */
07be1b83 1793 U32 wordlen = 0; /* required init */
a3621e74
YO
1794 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1795
3dab1dad 1796 if ( OP(noper) != NOTHING ) {
786e8c11 1797 for ( ; uc < e ; uc += len ) {
a3621e74 1798
786e8c11 1799 TRIE_READ_CHAR;
a3621e74 1800
786e8c11
YO
1801 if ( uvc < 256 ) {
1802 charid = trie->charmap[ uvc ];
1803 } else {
55eed653 1804 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1805 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1806 }
786e8c11
YO
1807 if ( charid ) {
1808 charid--;
1809 if ( !trie->trans[ state + charid ].next ) {
1810 trie->trans[ state + charid ].next = next_alloc;
1811 trie->trans[ state ].check++;
2e64971a
DM
1812 prev_states[TRIE_NODENUM(next_alloc)]
1813 = TRIE_NODENUM(state);
786e8c11
YO
1814 next_alloc += trie->uniquecharcount;
1815 }
1816 state = trie->trans[ state + charid ].next;
1817 } else {
1818 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1819 }
1820 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1821 }
a3621e74 1822 }
3dab1dad
YO
1823 accept_state = TRIE_NODENUM( state );
1824 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1825
1826 } /* end second pass */
1827
3dab1dad 1828 /* and now dump it out before we compress it */
2b8b4781
NC
1829 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1830 revcharmap,
1831 next_alloc, depth+1));
a3621e74 1832
a3621e74
YO
1833 {
1834 /*
1835 * Inplace compress the table.*
1836
1837 For sparse data sets the table constructed by the trie algorithm will
1838 be mostly 0/FAIL transitions or to put it another way mostly empty.
1839 (Note that leaf nodes will not contain any transitions.)
1840
1841 This algorithm compresses the tables by eliminating most such
1842 transitions, at the cost of a modest bit of extra work during lookup:
1843
1844 - Each states[] entry contains a .base field which indicates the
1845 index in the state[] array wheres its transition data is stored.
1846
3b753521 1847 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1848
1849 - If .base is nonzero then charid is added to it to find an entry in
1850 the trans array.
1851
1852 -If trans[states[state].base+charid].check!=state then the
1853 transition is taken to be a 0/Fail transition. Thus if there are fail
1854 transitions at the front of the node then the .base offset will point
1855 somewhere inside the previous nodes data (or maybe even into a node
1856 even earlier), but the .check field determines if the transition is
1857 valid.
1858
786e8c11 1859 XXX - wrong maybe?
a3621e74 1860 The following process inplace converts the table to the compressed
3b753521 1861 table: We first do not compress the root node 1,and mark all its
a3621e74 1862 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1863 allows us to do a DFA construction from the compressed table later,
1864 and ensures that any .base pointers we calculate later are greater
1865 than 0.
a3621e74
YO
1866
1867 - We set 'pos' to indicate the first entry of the second node.
1868
1869 - We then iterate over the columns of the node, finding the first and
1870 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1871 and set the .check pointers accordingly, and advance pos
1872 appropriately and repreat for the next node. Note that when we copy
1873 the next pointers we have to convert them from the original
1874 NODEIDX form to NODENUM form as the former is not valid post
1875 compression.
1876
1877 - If a node has no transitions used we mark its base as 0 and do not
1878 advance the pos pointer.
1879
1880 - If a node only has one transition we use a second pointer into the
1881 structure to fill in allocated fail transitions from other states.
1882 This pointer is independent of the main pointer and scans forward
1883 looking for null transitions that are allocated to a state. When it
1884 finds one it writes the single transition into the "hole". If the
786e8c11 1885 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1886
1887 - Once compressed we can Renew/realloc the structures to release the
1888 excess space.
1889
1890 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1891 specifically Fig 3.47 and the associated pseudocode.
1892
1893 demq
1894 */
a3b680e6 1895 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1896 U32 state, charid;
a3621e74 1897 U32 pos = 0, zp=0;
1e2e3d02 1898 trie->statecount = laststate;
a3621e74
YO
1899
1900 for ( state = 1 ; state < laststate ; state++ ) {
1901 U8 flag = 0;
a28509cc
AL
1902 const U32 stateidx = TRIE_NODEIDX( state );
1903 const U32 o_used = trie->trans[ stateidx ].check;
1904 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1905 trie->trans[ stateidx ].check = 0;
1906
1907 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1908 if ( flag || trie->trans[ stateidx + charid ].next ) {
1909 if ( trie->trans[ stateidx + charid ].next ) {
1910 if (o_used == 1) {
1911 for ( ; zp < pos ; zp++ ) {
1912 if ( ! trie->trans[ zp ].next ) {
1913 break;
1914 }
1915 }
1916 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1917 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1918 trie->trans[ zp ].check = state;
1919 if ( ++zp > pos ) pos = zp;
1920 break;
1921 }
1922 used--;
1923 }
1924 if ( !flag ) {
1925 flag = 1;
1926 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1927 }
1928 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1929 trie->trans[ pos ].check = state;
1930 pos++;
1931 }
1932 }
1933 }
cc601c31 1934 trie->lasttrans = pos + 1;
c944940b
JH
1935 trie->states = (reg_trie_state *)
1936 PerlMemShared_realloc( trie->states, laststate
1937 * sizeof(reg_trie_state) );
a3621e74 1938 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1939 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1940 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1941 (int)depth * 2 + 2,"",
1942 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1943 (IV)next_alloc,
1944 (IV)pos,
a3621e74
YO
1945 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1946 );
1947
1948 } /* end table compress */
1949 }
1e2e3d02
YO
1950 DEBUG_TRIE_COMPILE_MORE_r(
1951 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1952 (int)depth * 2 + 2, "",
1953 (UV)trie->statecount,
1954 (UV)trie->lasttrans)
1955 );
cc601c31 1956 /* resize the trans array to remove unused space */
c944940b
JH
1957 trie->trans = (reg_trie_trans *)
1958 PerlMemShared_realloc( trie->trans, trie->lasttrans
1959 * sizeof(reg_trie_trans) );
a3621e74 1960
3b753521 1961 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
1962 U8 nodetype =(U8)(flags & 0xFF);
1963 char *str=NULL;
786e8c11 1964
07be1b83 1965#ifdef DEBUGGING
e62cc96a 1966 regnode *optimize = NULL;
7122b237
YO
1967#ifdef RE_TRACK_PATTERN_OFFSETS
1968
b57a0404
JH
1969 U32 mjd_offset = 0;
1970 U32 mjd_nodelen = 0;
7122b237
YO
1971#endif /* RE_TRACK_PATTERN_OFFSETS */
1972#endif /* DEBUGGING */
a3621e74 1973 /*
3dab1dad
YO
1974 This means we convert either the first branch or the first Exact,
1975 depending on whether the thing following (in 'last') is a branch
1976 or not and whther first is the startbranch (ie is it a sub part of
1977 the alternation or is it the whole thing.)
3b753521 1978 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 1979 the whole branch sequence, including the first.
a3621e74 1980 */
3dab1dad 1981 /* Find the node we are going to overwrite */
7f69552c 1982 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1983 /* branch sub-chain */
3dab1dad 1984 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1985#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1986 DEBUG_r({
1987 mjd_offset= Node_Offset((convert));
1988 mjd_nodelen= Node_Length((convert));
1989 });
7122b237 1990#endif
7f69552c 1991 /* whole branch chain */
7122b237
YO
1992 }
1993#ifdef RE_TRACK_PATTERN_OFFSETS
1994 else {
7f69552c
YO
1995 DEBUG_r({
1996 const regnode *nop = NEXTOPER( convert );
1997 mjd_offset= Node_Offset((nop));
1998 mjd_nodelen= Node_Length((nop));
1999 });
07be1b83
YO
2000 }
2001 DEBUG_OPTIMISE_r(
2002 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2003 (int)depth * 2 + 2, "",
786e8c11 2004 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2005 );
7122b237 2006#endif
3dab1dad
YO
2007 /* But first we check to see if there is a common prefix we can
2008 split out as an EXACT and put in front of the TRIE node. */
2009 trie->startstate= 1;
55eed653 2010 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2011 U32 state;
1e2e3d02 2012 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2013 U32 ofs = 0;
8e11feef
RGS
2014 I32 idx = -1;
2015 U32 count = 0;
2016 const U32 base = trie->states[ state ].trans.base;
a3621e74 2017
3dab1dad 2018 if ( trie->states[state].wordnum )
8e11feef 2019 count = 1;
a3621e74 2020
8e11feef 2021 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2022 if ( ( base + ofs >= trie->uniquecharcount ) &&
2023 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2024 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2025 {
3dab1dad 2026 if ( ++count > 1 ) {
2b8b4781 2027 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2028 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2029 if ( state == 1 ) break;
3dab1dad
YO
2030 if ( count == 2 ) {
2031 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2032 DEBUG_OPTIMISE_r(
8e11feef
RGS
2033 PerlIO_printf(Perl_debug_log,
2034 "%*sNew Start State=%"UVuf" Class: [",
2035 (int)depth * 2 + 2, "",
786e8c11 2036 (UV)state));
be8e71aa 2037 if (idx >= 0) {
2b8b4781 2038 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2039 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2040
3dab1dad 2041 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2042 if ( folder )
2043 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2044 DEBUG_OPTIMISE_r(
f1f66076 2045 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2046 );
8e11feef
RGS
2047 }
2048 }
2049 TRIE_BITMAP_SET(trie,*ch);
2050 if ( folder )
2051 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2052 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2053 }
2054 idx = ofs;
2055 }
3dab1dad
YO
2056 }
2057 if ( count == 1 ) {
2b8b4781 2058 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2059 STRLEN len;
2060 char *ch = SvPV( *tmp, len );
de734bd5
A
2061 DEBUG_OPTIMISE_r({
2062 SV *sv=sv_newmortal();
8e11feef
RGS
2063 PerlIO_printf( Perl_debug_log,
2064 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2065 (int)depth * 2 + 2, "",
de734bd5
A
2066 (UV)state, (UV)idx,
2067 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2068 PL_colors[0], PL_colors[1],
2069 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2070 PERL_PV_ESCAPE_FIRSTCHAR
2071 )
2072 );
2073 });
3dab1dad
YO
2074 if ( state==1 ) {
2075 OP( convert ) = nodetype;
2076 str=STRING(convert);
2077 STR_LEN(convert)=0;
2078 }
c490c714
YO
2079 STR_LEN(convert) += len;
2080 while (len--)
de734bd5 2081 *str++ = *ch++;
8e11feef 2082 } else {
f9049ba1 2083#ifdef DEBUGGING
8e11feef
RGS
2084 if (state>1)
2085 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2086#endif
8e11feef
RGS
2087 break;
2088 }
2089 }
2e64971a 2090 trie->prefixlen = (state-1);
3dab1dad 2091 if (str) {
8e11feef 2092 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2093 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2094 trie->startstate = state;
07be1b83
YO
2095 trie->minlen -= (state - 1);
2096 trie->maxlen -= (state - 1);
33809eae
JH
2097#ifdef DEBUGGING
2098 /* At least the UNICOS C compiler choked on this
2099 * being argument to DEBUG_r(), so let's just have
2100 * it right here. */
2101 if (
2102#ifdef PERL_EXT_RE_BUILD
2103 1
2104#else
2105 DEBUG_r_TEST
2106#endif
2107 ) {
2108 regnode *fix = convert;
2109 U32 word = trie->wordcount;
2110 mjd_nodelen++;
2111 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2112 while( ++fix < n ) {
2113 Set_Node_Offset_Length(fix, 0, 0);
2114 }
2115 while (word--) {
2116 SV ** const tmp = av_fetch( trie_words, word, 0 );
2117 if (tmp) {
2118 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2119 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2120 else
2121 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2122 }
2123 }
2124 }
2125#endif
8e11feef
RGS
2126 if (trie->maxlen) {
2127 convert = n;
2128 } else {
3dab1dad 2129 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2130 DEBUG_r(optimize= n);
3dab1dad
YO
2131 }
2132 }
2133 }
a5ca303d
YO
2134 if (!jumper)
2135 jumper = last;
3dab1dad 2136 if ( trie->maxlen ) {
8e11feef
RGS
2137 NEXT_OFF( convert ) = (U16)(tail - convert);
2138 ARG_SET( convert, data_slot );
786e8c11
YO
2139 /* Store the offset to the first unabsorbed branch in
2140 jump[0], which is otherwise unused by the jump logic.
2141 We use this when dumping a trie and during optimisation. */
2142 if (trie->jump)
7f69552c 2143 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2144
6c48061a
YO
2145 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2146 * and there is a bitmap
2147 * and the first "jump target" node we found leaves enough room
2148 * then convert the TRIE node into a TRIEC node, with the bitmap
2149 * embedded inline in the opcode - this is hypothetically faster.
2150 */
2151 if ( !trie->states[trie->startstate].wordnum
2152 && trie->bitmap
2153 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2154 {
2155 OP( convert ) = TRIEC;
2156 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2157 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2158 trie->bitmap= NULL;
2159 } else
2160 OP( convert ) = TRIE;
a3621e74 2161
3dab1dad
YO
2162 /* store the type in the flags */
2163 convert->flags = nodetype;
a5ca303d
YO
2164 DEBUG_r({
2165 optimize = convert
2166 + NODE_STEP_REGNODE
2167 + regarglen[ OP( convert ) ];
2168 });
2169 /* XXX We really should free up the resource in trie now,
2170 as we won't use them - (which resources?) dmq */
3dab1dad 2171 }
a3621e74 2172 /* needed for dumping*/
e62cc96a 2173 DEBUG_r(if (optimize) {
07be1b83 2174 regnode *opt = convert;
bcdf7404 2175
e62cc96a 2176 while ( ++opt < optimize) {
07be1b83
YO
2177 Set_Node_Offset_Length(opt,0,0);
2178 }
786e8c11
YO
2179 /*
2180 Try to clean up some of the debris left after the
2181 optimisation.
a3621e74 2182 */
786e8c11 2183 while( optimize < jumper ) {
07be1b83 2184 mjd_nodelen += Node_Length((optimize));
a3621e74 2185 OP( optimize ) = OPTIMIZED;
07be1b83 2186 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2187 optimize++;
2188 }
07be1b83 2189 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2190 });
2191 } /* end node insert */
2e64971a
DM
2192
2193 /* Finish populating the prev field of the wordinfo array. Walk back
2194 * from each accept state until we find another accept state, and if
2195 * so, point the first word's .prev field at the second word. If the
2196 * second already has a .prev field set, stop now. This will be the
2197 * case either if we've already processed that word's accept state,
3b753521
FN
2198 * or that state had multiple words, and the overspill words were
2199 * already linked up earlier.
2e64971a
DM
2200 */
2201 {
2202 U16 word;
2203 U32 state;
2204 U16 prev;
2205
2206 for (word=1; word <= trie->wordcount; word++) {
2207 prev = 0;
2208 if (trie->wordinfo[word].prev)
2209 continue;
2210 state = trie->wordinfo[word].accept;
2211 while (state) {
2212 state = prev_states[state];
2213 if (!state)
2214 break;
2215 prev = trie->states[state].wordnum;
2216 if (prev)
2217 break;
2218 }
2219 trie->wordinfo[word].prev = prev;
2220 }
2221 Safefree(prev_states);
2222 }
2223
2224
2225 /* and now dump out the compressed format */
2226 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2227
55eed653 2228 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2229#ifdef DEBUGGING
2230 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2231 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2232#else
2233 SvREFCNT_dec(revcharmap);
07be1b83 2234#endif
786e8c11
YO
2235 return trie->jump
2236 ? MADE_JUMP_TRIE
2237 : trie->startstate>1
2238 ? MADE_EXACT_TRIE
2239 : MADE_TRIE;
2240}
2241
2242STATIC void
2243S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2244{
3b753521 2245/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2246
2247 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2248 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2249 ISBN 0-201-10088-6
2250
2251 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2252 suffix of the current state's 'word' that is also a proper prefix of another word in our
2253 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2254 the DFA not to have to restart after its tried and failed a word at a given point, it
2255 simply continues as though it had been matching the other word in the first place.
2256 Consider
2257 'abcdgu'=~/abcdefg|cdgu/
2258 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2259 fail, which would bring us to the state representing 'd' in the second word where we would
2260 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2261 */
2262 /* add a fail transition */
3251b653
NC
2263 const U32 trie_offset = ARG(source);
2264 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2265 U32 *q;
2266 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2267 const U32 numstates = trie->statecount;
786e8c11
YO
2268 const U32 ubound = trie->lasttrans + ucharcount;
2269 U32 q_read = 0;
2270 U32 q_write = 0;
2271 U32 charid;
2272 U32 base = trie->states[ 1 ].trans.base;
2273 U32 *fail;
2274 reg_ac_data *aho;
2275 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2276 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2277
2278 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2279#ifndef DEBUGGING
2280 PERL_UNUSED_ARG(depth);
2281#endif
2282
2283
2284 ARG_SET( stclass, data_slot );
c944940b 2285 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2286 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2287 aho->trie=trie_offset;
446bd890
NC
2288 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2289 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2290 Newxz( q, numstates, U32);
c944940b 2291 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2292 aho->refcount = 1;
2293 fail = aho->fail;
2294 /* initialize fail[0..1] to be 1 so that we always have
2295 a valid final fail state */
2296 fail[ 0 ] = fail[ 1 ] = 1;
2297
2298 for ( charid = 0; charid < ucharcount ; charid++ ) {
2299 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2300 if ( newstate ) {
2301 q[ q_write ] = newstate;
2302 /* set to point at the root */
2303 fail[ q[ q_write++ ] ]=1;
2304 }
2305 }
2306 while ( q_read < q_write) {
2307 const U32 cur = q[ q_read++ % numstates ];
2308 base = trie->states[ cur ].trans.base;
2309
2310 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2311 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2312 if (ch_state) {
2313 U32 fail_state = cur;
2314 U32 fail_base;
2315 do {
2316 fail_state = fail[ fail_state ];
2317 fail_base = aho->states[ fail_state ].trans.base;
2318 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2319
2320 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2321 fail[ ch_state ] = fail_state;
2322 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2323 {
2324 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2325 }
2326 q[ q_write++ % numstates] = ch_state;
2327 }
2328 }
2329 }
2330 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2331 when we fail in state 1, this allows us to use the
2332 charclass scan to find a valid start char. This is based on the principle
2333 that theres a good chance the string being searched contains lots of stuff
2334 that cant be a start char.
2335 */
2336 fail[ 0 ] = fail[ 1 ] = 0;
2337 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2338 PerlIO_printf(Perl_debug_log,
2339 "%*sStclass Failtable (%"UVuf" states): 0",
2340 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2341 );
786e8c11
YO
2342 for( q_read=1; q_read<numstates; q_read++ ) {
2343 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2344 }
2345 PerlIO_printf(Perl_debug_log, "\n");
2346 });
2347 Safefree(q);
2348 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2349}
2350
786e8c11 2351
a3621e74 2352/*
5d1c421c
JH
2353 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2354 * These need to be revisited when a newer toolchain becomes available.
2355 */
2356#if defined(__sparc64__) && defined(__GNUC__)
2357# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2358# undef SPARC64_GCC_WORKAROUND
2359# define SPARC64_GCC_WORKAROUND 1
2360# endif
2361#endif
2362
07be1b83 2363#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2364 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2365 SV * const mysv=sv_newmortal(); \
2366 regnode *Next = regnext(scan); \
2367 regprop(RExC_rx, mysv, scan); \
7f69552c 2368 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2369 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2370 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2371 }});
07be1b83 2372
1de06328
YO
2373
2374
2375
2376
07be1b83
YO
2377#define JOIN_EXACT(scan,min,flags) \
2378 if (PL_regkind[OP(scan)] == EXACT) \
2379 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2380
be8e71aa 2381STATIC U32
07be1b83
YO
2382S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2383 /* Merge several consecutive EXACTish nodes into one. */
2384 regnode *n = regnext(scan);
2385 U32 stringok = 1;
2386 regnode *next = scan + NODE_SZ_STR(scan);
2387 U32 merged = 0;
2388 U32 stopnow = 0;
2389#ifdef DEBUGGING
2390 regnode *stop = scan;
72f13be8 2391 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2392#else
d47053eb
RGS
2393 PERL_UNUSED_ARG(depth);
2394#endif
7918f24d
NC
2395
2396 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2397#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2398 PERL_UNUSED_ARG(flags);
2399 PERL_UNUSED_ARG(val);
07be1b83 2400#endif
07be1b83
YO
2401 DEBUG_PEEP("join",scan,depth);
2402
2403 /* Skip NOTHING, merge EXACT*. */
2404 while (n &&
2405 ( PL_regkind[OP(n)] == NOTHING ||
2406 (stringok && (OP(n) == OP(scan))))
2407 && NEXT_OFF(n)
2408 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2409
2410 if (OP(n) == TAIL || n > next)
2411 stringok = 0;
2412 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2413 DEBUG_PEEP("skip:",n,depth);
2414 NEXT_OFF(scan) += NEXT_OFF(n);
2415 next = n + NODE_STEP_REGNODE;
2416#ifdef DEBUGGING
2417 if (stringok)
2418 stop = n;
2419#endif
2420 n = regnext(n);
2421 }
2422 else if (stringok) {
786e8c11 2423 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2424 regnode * const nnext = regnext(n);
2425
2426 DEBUG_PEEP("merg",n,depth);
2427
2428 merged++;
2429 if (oldl + STR_LEN(n) > U8_MAX)
2430 break;
2431 NEXT_OFF(scan) += NEXT_OFF(n);
2432 STR_LEN(scan) += STR_LEN(n);
2433 next = n + NODE_SZ_STR(n);
2434 /* Now we can overwrite *n : */
2435 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2436#ifdef DEBUGGING
2437 stop = next - 1;
2438#endif
2439 n = nnext;
2440 if (stopnow) break;
2441 }
2442
d47053eb
RGS
2443#ifdef EXPERIMENTAL_INPLACESCAN
2444 if (flags && !NEXT_OFF(n)) {
2445 DEBUG_PEEP("atch", val, depth);
2446 if (reg_off_by_arg[OP(n)]) {
2447 ARG_SET(n, val - n);
2448 }
2449 else {
2450 NEXT_OFF(n) = val - n;
2451 }
2452 stopnow = 1;
2453 }
07be1b83
YO
2454#endif
2455 }
ced7f090
KW
2456#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2457#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2458#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2459#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2c2b7f86
KW
2460
2461 if (UTF
2462 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2463 && ( STR_LEN(scan) >= 6 ) )
2464 {
07be1b83
YO
2465 /*
2466 Two problematic code points in Unicode casefolding of EXACT nodes:
2467
2468 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2469 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2470
2471 which casefold to
2472
2473 Unicode UTF-8
2474
2475 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2476 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2477
2478 This means that in case-insensitive matching (or "loose matching",
2479 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2480 length of the above casefolded versions) can match a target string
2481 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2482 This would rather mess up the minimum length computation.
2483
2484 What we'll do is to look for the tail four bytes, and then peek
2485 at the preceding two bytes to see whether we need to decrease
2486 the minimum length by four (six minus two).
2487
2488 Thanks to the design of UTF-8, there cannot be false matches:
2489 A sequence of valid UTF-8 bytes cannot be a subsequence of
2490 another valid sequence of UTF-8 bytes.
2491
2492 */
2493 char * const s0 = STRING(scan), *s, *t;
2494 char * const s1 = s0 + STR_LEN(scan) - 1;
2495 char * const s2 = s1 - 4;
e294cc5d
JH
2496#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2497 const char t0[] = "\xaf\x49\xaf\x42";
2498#else
07be1b83 2499 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2500#endif
07be1b83
YO
2501 const char * const t1 = t0 + 3;
2502
2503 for (s = s0 + 2;
2504 s < s2 && (t = ninstr(s, s1, t0, t1));
2505 s = t + 4) {
e294cc5d
JH
2506#ifdef EBCDIC
2507 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2508 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2509#else
07be1b83
YO
2510 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2511 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2512#endif
07be1b83
YO
2513 *min -= 4;
2514 }
2515 }
2516
2517#ifdef DEBUGGING
2518 /* Allow dumping */
2519 n = scan + NODE_SZ_STR(scan);
2520 while (n <= stop) {
2521 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2522 OP(n) = OPTIMIZED;
2523 NEXT_OFF(n) = 0;
2524 }
2525 n++;
2526 }
2527#endif
2528 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2529 return stopnow;
2530}
2531
486ec47a 2532/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2533 Finds fixed substrings. */
2534
a0288114 2535/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2536 to the position after last scanned or to NULL. */
2537
40d049e4
YO
2538#define INIT_AND_WITHP \
2539 assert(!and_withp); \
2540 Newx(and_withp,1,struct regnode_charclass_class); \
2541 SAVEFREEPV(and_withp)
07be1b83 2542
b515a41d 2543/* this is a chain of data about sub patterns we are processing that
486ec47a 2544 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2545 we can simulate recursion without losing state. */
2546struct scan_frame;
2547typedef struct scan_frame {
2548 regnode *last; /* last node to process in this frame */
2549 regnode *next; /* next node to process when last is reached */
2550 struct scan_frame *prev; /*previous frame*/
2551 I32 stop; /* what stopparen do we use */
2552} scan_frame;
2553
304ee84b
YO
2554
2555#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2556
e1d1eefb
YO
2557#define CASE_SYNST_FNC(nAmE) \
2558case nAmE: \
2559 if (flags & SCF_DO_STCLASS_AND) { \
2560 for (value = 0; value < 256; value++) \
2561 if (!is_ ## nAmE ## _cp(value)) \
2562 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2563 } \
2564 else { \
2565 for (value = 0; value < 256; value++) \
2566 if (is_ ## nAmE ## _cp(value)) \
2567 ANYOF_BITMAP_SET(data->start_class, value); \
2568 } \
2569 break; \
2570case N ## nAmE: \
2571 if (flags & SCF_DO_STCLASS_AND) { \
2572 for (value = 0; value < 256; value++) \
2573 if (is_ ## nAmE ## _cp(value)) \
2574 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2575 } \
2576 else { \
2577 for (value = 0; value < 256; value++) \
2578 if (!is_ ## nAmE ## _cp(value)) \
2579 ANYOF_BITMAP_SET(data->start_class, value); \
2580 } \
2581 break
2582
2583
2584
76e3520e 2585STATIC I32
40d049e4 2586S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2587 I32 *minlenp, I32 *deltap,
40d049e4
YO
2588 regnode *last,
2589 scan_data_t *data,
2590 I32 stopparen,
2591 U8* recursed,
2592 struct regnode_charclass_class *and_withp,
2593 U32 flags, U32 depth)
c277df42
IZ
2594 /* scanp: Start here (read-write). */
2595 /* deltap: Write maxlen-minlen here. */
2596 /* last: Stop before this one. */
40d049e4
YO
2597 /* data: string data about the pattern */
2598 /* stopparen: treat close N as END */
2599 /* recursed: which subroutines have we recursed into */
2600 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2601{
97aff369 2602 dVAR;
c277df42
IZ
2603 I32 min = 0, pars = 0, code;
2604 regnode *scan = *scanp, *next;
2605 I32 delta = 0;
2606 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2607 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2608 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2609 scan_data_t data_fake;
a3621e74 2610 SV *re_trie_maxbuff = NULL;
786e8c11 2611 regnode *first_non_open = scan;
e2e6a0f1 2612 I32 stopmin = I32_MAX;
8aa23a47 2613 scan_frame *frame = NULL;
a3621e74 2614 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2615
7918f24d
NC
2616 PERL_ARGS_ASSERT_STUDY_CHUNK;
2617
13a24bad 2618#ifdef DEBUGGING
40d049e4 2619 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2620#endif
40d049e4 2621
786e8c11 2622 if ( depth == 0 ) {
40d049e4 2623 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2624 first_non_open=regnext(first_non_open);
2625 }
2626
b81d288d 2627
8aa23a47
YO
2628 fake_study_recurse:
2629 while ( scan && OP(scan) != END && scan < last ){
2630 /* Peephole optimizer: */
304ee84b 2631 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2632 DEBUG_PEEP("Peep",scan,depth);
2633 JOIN_EXACT(scan,&min,0);
2634
2635 /* Follow the next-chain of the current node and optimize
2636 away all the NOTHINGs from it. */
2637 if (OP(scan) != CURLYX) {
2638 const int max = (reg_off_by_arg[OP(scan)]
2639 ? I32_MAX
2640 /* I32 may be smaller than U16 on CRAYs! */
2641 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2642 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2643 int noff;
2644 regnode *n = scan;
2645
2646 /* Skip NOTHING and LONGJMP. */
2647 while ((n = regnext(n))
2648 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2649 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2650 && off + noff < max)
2651 off += noff;
2652 if (reg_off_by_arg[OP(scan)])
2653 ARG(scan) = off;
2654 else
2655 NEXT_OFF(scan) = off;
2656 }
a3621e74 2657
c277df42 2658
8aa23a47
YO
2659
2660 /* The principal pseudo-switch. Cannot be a switch, since we
2661 look into several different things. */
2662 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2663 || OP(scan) == IFTHEN) {
2664 next = regnext(scan);
2665 code = OP(scan);
2666 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2667
2668 if (OP(next) == code || code == IFTHEN) {
2669 /* NOTE - There is similar code to this block below for handling
2670 TRIE nodes on a re-study. If you change stuff here check there
2671 too. */
2672 I32 max1 = 0, min1 = I32_MAX, num = 0;
2673 struct regnode_charclass_class accum;
2674 regnode * const startbranch=scan;
2675
2676 if (flags & SCF_DO_SUBSTR)
304ee84b 2677 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2678 if (flags & SCF_DO_STCLASS)
2679 cl_init_zero(pRExC_state, &accum);
2680
2681 while (OP(scan) == code) {
2682 I32 deltanext, minnext, f = 0, fake;
2683 struct regnode_charclass_class this_class;
2684
2685 num++;
2686 data_fake.flags = 0;
2687 if (data) {
2688 data_fake.whilem_c = data->whilem_c;
2689 data_fake.last_closep = data->last_closep;
2690 }
2691 else
2692 data_fake.last_closep = &fake;
58e23c8d
YO
2693
2694 data_fake.pos_delta = delta;
8aa23a47
YO
2695 next = regnext(scan);
2696 scan = NEXTOPER(scan);
2697 if (code != BRANCH)
c277df42 2698 scan = NEXTOPER(scan);
8aa23a47
YO
2699 if (flags & SCF_DO_STCLASS) {
2700 cl_init(pRExC_state, &this_class);
2701 data_fake.start_class = &this_class;
2702 f = SCF_DO_STCLASS_AND;
58e23c8d 2703 }
8aa23a47
YO
2704 if (flags & SCF_WHILEM_VISITED_POS)
2705 f |= SCF_WHILEM_VISITED_POS;
2706
2707 /* we suppose the run is continuous, last=next...*/
2708 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2709 next, &data_fake,
2710 stopparen, recursed, NULL, f,depth+1);
2711 if (min1 > minnext)
2712 min1 = minnext;
2713 if (max1 < minnext + deltanext)
2714 max1 = minnext + deltanext;
2715 if (deltanext == I32_MAX)
2716 is_inf = is_inf_internal = 1;
2717 scan = next;
2718 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2719 pars++;
2720 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2721 if ( stopmin > minnext)
2722 stopmin = min + min1;
2723 flags &= ~SCF_DO_SUBSTR;
2724 if (data)
2725 data->flags |= SCF_SEEN_ACCEPT;
2726 }
2727 if (data) {
2728 if (data_fake.flags & SF_HAS_EVAL)
2729 data->flags |= SF_HAS_EVAL;
2730 data->whilem_c = data_fake.whilem_c;
3dab1dad 2731 }
8aa23a47
YO
2732 if (flags & SCF_DO_STCLASS)
2733 cl_or(pRExC_state, &accum, &this_class);
2734 }
2735 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2736 min1 = 0;
2737 if (flags & SCF_DO_SUBSTR) {
2738 data->pos_min += min1;
2739 data->pos_delta += max1 - min1;
2740 if (max1 != min1 || is_inf)
2741 data->longest = &(data->longest_float);
2742 }
2743 min += min1;
2744 delta += max1 - min1;
2745 if (flags & SCF_DO_STCLASS_OR) {
2746 cl_or(pRExC_state, data->start_class, &accum);
2747 if (min1) {
2748 cl_and(data->start_class, and_withp);
2749 flags &= ~SCF_DO_STCLASS;
653099ff 2750 }
8aa23a47
YO
2751 }
2752 else if (flags & SCF_DO_STCLASS_AND) {
2753 if (min1) {
2754 cl_and(data->start_class, &accum);
2755 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2756 }
8aa23a47
YO
2757 else {
2758 /* Switch to OR mode: cache the old value of
2759 * data->start_class */
2760 INIT_AND_WITHP;
2761 StructCopy(data->start_class, and_withp,
2762 struct regnode_charclass_class);
2763 flags &= ~SCF_DO_STCLASS_AND;
2764 StructCopy(&accum, data->start_class,
2765 struct regnode_charclass_class);
2766 flags |= SCF_DO_STCLASS_OR;
2767 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2768 }
8aa23a47 2769 }
a3621e74 2770
8aa23a47
YO
2771 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2772 /* demq.
a3621e74 2773
8aa23a47
YO
2774 Assuming this was/is a branch we are dealing with: 'scan' now
2775 points at the item that follows the branch sequence, whatever
2776 it is. We now start at the beginning of the sequence and look
2777 for subsequences of
a3621e74 2778
8aa23a47
YO
2779 BRANCH->EXACT=>x1
2780 BRANCH->EXACT=>x2
2781 tail
a3621e74 2782
8aa23a47 2783 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2784
486ec47a 2785 If we can find such a subsequence we need to turn the first
8aa23a47
YO
2786 element into a trie and then add the subsequent branch exact
2787 strings to the trie.
a3621e74 2788
8aa23a47 2789 We have two cases
a3621e74 2790
3b753521 2791 1. patterns where the whole set of branches can be converted.
a3621e74 2792
8aa23a47 2793 2. patterns where only a subset can be converted.
a3621e74 2794
8aa23a47
YO
2795 In case 1 we can replace the whole set with a single regop
2796 for the trie. In case 2 we need to keep the start and end
3b753521 2797 branches so
a3621e74 2798
8aa23a47
YO
2799 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2800 becomes BRANCH TRIE; BRANCH X;
786e8c11 2801
8aa23a47
YO
2802 There is an additional case, that being where there is a
2803 common prefix, which gets split out into an EXACT like node
2804 preceding the TRIE node.
a3621e74 2805
8aa23a47
YO
2806 If x(1..n)==tail then we can do a simple trie, if not we make
2807 a "jump" trie, such that when we match the appropriate word
486ec47a 2808 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 2809 a nested if into a case structure of sorts.
b515a41d 2810
8aa23a47
YO
2811 */
2812
2813 int made=0;
2814 if (!re_trie_maxbuff) {
2815 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2816 if (!SvIOK(re_trie_maxbuff))
2817 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2818 }
2819 if ( SvIV(re_trie_maxbuff)>=0 ) {
2820 regnode *cur;
2821 regnode *first = (regnode *)NULL;
2822 regnode *last = (regnode *)NULL;
2823 regnode *tail = scan;
2824 U8 optype = 0;
2825 U32 count=0;
a3621e74
YO
2826
2827#ifdef DEBUGGING
8aa23a47 2828 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2829#endif
8aa23a47
YO
2830 /* var tail is used because there may be a TAIL
2831 regop in the way. Ie, the exacts will point to the
2832 thing following the TAIL, but the last branch will
2833 point at the TAIL. So we advance tail. If we
2834 have nested (?:) we may have to move through several
2835 tails.
2836 */
2837
2838 while ( OP( tail ) == TAIL ) {
2839 /* this is the TAIL generated by (?:) */
2840 tail = regnext( tail );
2841 }
a3621e74 2842
8aa23a47
YO
2843
2844 DEBUG_OPTIMISE_r({
2845 regprop(RExC_rx, mysv, tail );
2846 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2847 (int)depth * 2 + 2, "",
2848 "Looking for TRIE'able sequences. Tail node is: ",
2849 SvPV_nolen_const( mysv )
2850 );
2851 });
2852
2853 /*
2854
2855 step through the branches, cur represents each
2856 branch, noper is the first thing to be matched
2857 as part of that branch and noper_next is the
2858 regnext() of that node. if noper is an EXACT
2859 and noper_next is the same as scan (our current
2860 position in the regex) then the EXACT branch is
2861 a possible optimization target. Once we have
486ec47a 2862 two or more consecutive such branches we can
8aa23a47
YO
2863 create a trie of the EXACT's contents and stich
2864 it in place. If the sequence represents all of
2865 the branches we eliminate the whole thing and
2866 replace it with a single TRIE. If it is a
2867 subsequence then we need to stitch it in. This
2868 means the first branch has to remain, and needs
2869 to be repointed at the item on the branch chain
2870 following the last branch optimized. This could
2871 be either a BRANCH, in which case the
2872 subsequence is internal, or it could be the
2873 item following the branch sequence in which
2874 case the subsequence is at the end.
2875
2876 */
2877
2878 /* dont use tail as the end marker for this traverse */
2879 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2880 regnode * const noper = NEXTOPER( cur );
b515a41d 2881#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2882 regnode * const noper_next = regnext( noper );
b515a41d
YO
2883#endif
2884
8aa23a47
YO
2885 DEBUG_OPTIMISE_r({
2886 regprop(RExC_rx, mysv, cur);
2887 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2888 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2889
2890 regprop(RExC_rx, mysv, noper);
2891 PerlIO_printf( Perl_debug_log, " -> %s",
2892 SvPV_nolen_const(mysv));
2893
2894 if ( noper_next ) {
2895 regprop(RExC_rx, mysv, noper_next );
2896 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2897 SvPV_nolen_const(mysv));
2898 }
2899 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2900 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2901 });
2902 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2903 : PL_regkind[ OP( noper ) ] == EXACT )
2904 || OP(noper) == NOTHING )
786e8c11 2905#ifdef NOJUMPTRIE
8aa23a47 2906 && noper_next == tail
786e8c11 2907#endif
8aa23a47
YO
2908 && count < U16_MAX)
2909 {
2910 count++;
2911 if ( !first || optype == NOTHING ) {
2912 if (!first) first = cur;
2913 optype = OP( noper );
2914 } else {
2915 last = cur;
2916 }
2917 } else {
a0a388a1 2918/*
0abd0d78
YO
2919 Currently we do not believe that the trie logic can
2920 handle case insensitive matching properly when the
2921 pattern is not unicode (thus forcing unicode semantics).
2922
2923 If/when this is fixed the following define can be swapped
2924 in below to fully enable trie logic.
2925
a0a388a1 2926#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2927
2928*/
2929#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2930
a0a388a1 2931 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2932 make_trie( pRExC_state,
2933 startbranch, first, cur, tail, count,
2934 optype, depth+1 );
2935 }
2936 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2937#ifdef NOJUMPTRIE
8aa23a47 2938 && noper_next == tail
786e8c11 2939#endif
8aa23a47
YO
2940 ){
2941 count = 1;
2942 first = cur;
2943 optype = OP( noper );
2944 } else {
2945 count = 0;
2946 first = NULL;
2947 optype = 0;
2948 }
2949 last = NULL;
2950 }
2951 }
2952 DEBUG_OPTIMISE_r({
2953 regprop(RExC_rx, mysv, cur);
2954 PerlIO_printf( Perl_debug_log,
2955 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2956 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2957
2958 });
a0a388a1
YO
2959
2960 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2961 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2962#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2963 if ( ((made == MADE_EXACT_TRIE &&
2964 startbranch == first)
2965 || ( first_non_open == first )) &&
2966 depth==0 ) {
2967 flags |= SCF_TRIE_RESTUDY;
2968 if ( startbranch == first
2969 && scan == tail )
2970 {
2971 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2972 }
2973 }
3dab1dad 2974#endif
8aa23a47
YO
2975 }
2976 }
2977
2978 } /* do trie */
2979
653099ff 2980 }
8aa23a47
YO
2981 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2982 scan = NEXTOPER(NEXTOPER(scan));
2983 } else /* single branch is optimized. */
2984 scan = NEXTOPER(scan);
2985 continue;
2986 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2987 scan_frame *newframe = NULL;
2988 I32 paren;
2989 regnode *start;
2990 regnode *end;
2991
2992 if (OP(scan) != SUSPEND) {
2993 /* set the pointer */
2994 if (OP(scan) == GOSUB) {
2995 paren = ARG(scan);
2996 RExC_recurse[ARG2L(scan)] = scan;
2997 start = RExC_open_parens[paren-1];
2998 end = RExC_close_parens[paren-1];
2999 } else {
3000 paren = 0;
f8fc2ecf 3001 start = RExC_rxi->program + 1;
8aa23a47
YO
3002 end = RExC_opend;
3003 }
3004 if (!recursed) {
3005 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3006 SAVEFREEPV(recursed);
3007 }
3008 if (!PAREN_TEST(recursed,paren+1)) {
3009 PAREN_SET(recursed,paren+1);
3010 Newx(newframe,1,scan_frame);
3011 } else {
3012 if (flags & SCF_DO_SUBSTR) {
304ee84b 3013 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3014 data->longest = &(data->longest_float);
3015 }
3016 is_inf = is_inf_internal = 1;
3017 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3018 cl_anything(pRExC_state, data->start_class);
3019 flags &= ~SCF_DO_STCLASS;
3020 }
3021 } else {
3022 Newx(newframe,1,scan_frame);
3023 paren = stopparen;
3024 start = scan+2;
3025 end = regnext(scan);
3026 }
3027 if (newframe) {
3028 assert(start);
3029 assert(end);
3030 SAVEFREEPV(newframe);
3031 newframe->next = regnext(scan);
3032 newframe->last = last;
3033 newframe->stop = stopparen;
3034 newframe->prev = frame;
3035
3036 frame = newframe;
3037 scan = start;
3038 stopparen = paren;
3039 last = end;
3040
3041 continue;
3042 }
3043 }
3044 else if (OP(scan) == EXACT) {
3045 I32 l = STR_LEN(scan);
3046 UV uc;
3047 if (UTF) {
3048 const U8 * const s = (U8*)STRING(scan);
3049 l = utf8_length(s, s + l);
3050 uc = utf8_to_uvchr(s, NULL);
3051 } else {
3052 uc = *((U8*)STRING(scan));
3053 }
3054 min += l;
3055 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3056 /* The code below prefers earlier match for fixed
3057 offset, later match for variable offset. */
3058 if (data->last_end == -1) { /* Update the start info. */
3059 data->last_start_min = data->pos_min;
3060 data->last_start_max = is_inf
3061 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3062 }
8aa23a47
YO
3063 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3064 if (UTF)
3065 SvUTF8_on(data->last_found);
3066 {
3067 SV * const sv = data->last_found;
3068 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3069 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3070 if (mg && mg->mg_len >= 0)
3071 mg->mg_len += utf8_length((U8*)STRING(scan),
3072 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3073 }
8aa23a47
YO
3074 data->last_end = data->pos_min + l;
3075 data->pos_min += l; /* As in the first entry. */
3076 data->flags &= ~SF_BEFORE_EOL;
3077 }
3078 if (flags & SCF_DO_STCLASS_AND) {
3079 /* Check whether it is compatible with what we know already! */
3080 int compat = 1;
3081
54251c2e 3082
486ec47a 3083 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3084 * in the bitmp and either 1) its bit or its fold is set, or 2)
3085 * it's for a locale. Even if there isn't unicode semantics
3086 * here, at runtime there may be because of matching against a
3087 * utf8 string, so accept a possible false positive for
3088 * latin1-range folds */
8aa23a47
YO
3089 if (uc >= 0x100 ||
3090 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3091 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3092 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3093 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47
YO
3094 )
3095 compat = 0;
3096 ANYOF_CLASS_ZERO(data->start_class);
3097 ANYOF_BITMAP_ZERO(data->start_class);
3098 if (compat)
3099 ANYOF_BITMAP_SET(data->start_class, uc);
3100 data->start_class->flags &= ~ANYOF_EOS;
3101 if (uc < 0x100)
3102 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3103 }
3104 else if (flags & SCF_DO_STCLASS_OR) {
3105 /* false positive possible if the class is case-folded */
3106 if (uc < 0x100)
3107 ANYOF_BITMAP_SET(data->start_class, uc);
3108 else
3109 data->start_class->flags |= ANYOF_UNICODE_ALL;
3110 data->start_class->flags &= ~ANYOF_EOS;
3111 cl_and(data->start_class, and_withp);
3112 }
3113 flags &= ~SCF_DO_STCLASS;
3114 }
3115 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3116 I32 l = STR_LEN(scan);
3117 UV uc = *((U8*)STRING(scan));
3118
3119 /* Search for fixed substrings supports EXACT only. */
3120 if (flags & SCF_DO_SUBSTR) {
3121 assert(data);
304ee84b 3122 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3123 }
3124 if (UTF) {
3125 const U8 * const s = (U8 *)STRING(scan);
3126 l = utf8_length(s, s + l);
3127 uc = utf8_to_uvchr(s, NULL);
3128 }
3129 min += l;
3130 if (flags & SCF_DO_SUBSTR)
3131 data->pos_min += l;
3132 if (flags & SCF_DO_STCLASS_AND) {
3133 /* Check whether it is compatible with what we know already! */
3134 int compat = 1;
8aa23a47 3135 if (uc >= 0x100 ||
54251c2e
KW
3136 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3137 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3138 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3139 {
8aa23a47 3140 compat = 0;
54251c2e 3141 }
8aa23a47
YO
3142 ANYOF_CLASS_ZERO(data->start_class);
3143 ANYOF_BITMAP_ZERO(data->start_class);
3144 if (compat) {
3145 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3146 data->start_class->flags &= ~ANYOF_EOS;
39065660 3147 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3148 if (OP(scan) == EXACTFL) {
8aa23a47 3149 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3150 }
3151 else {
3152
54251c2e
KW
3153 /* Also set the other member of the fold pair. In case
3154 * that unicode semantics is called for at runtime, use
3155 * the full latin1 fold. (Can't do this for locale,
3156 * because not known until runtime */
3157 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
970c8436 3158 }
653099ff 3159 }
8aa23a47
YO
3160 }
3161 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3162 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3163 /* false positive possible if the class is case-folded.
3164 Assume that the locale settings are the same... */
970c8436 3165 if (uc < 0x100) {
1aa99e6b 3166 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3167 if (OP(scan) != EXACTFL) {
3168
3169 /* And set the other member of the fold pair, but
3170 * can't do that in locale because not known until
3171 * run-time */
3172 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3173 PL_fold_latin1[uc]);
970c8436
KW
3174 }
3175 }
653099ff
GS
3176 data->start_class->flags &= ~ANYOF_EOS;
3177 }
8aa23a47 3178 cl_and(data->start_class, and_withp);
653099ff 3179 }
8aa23a47
YO
3180 flags &= ~SCF_DO_STCLASS;
3181 }
e52fc539 3182 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3183 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3184 I32 f = flags, pos_before = 0;
3185 regnode * const oscan = scan;
3186 struct regnode_charclass_class this_class;
3187 struct regnode_charclass_class *oclass = NULL;
3188 I32 next_is_eval = 0;
3189
3190 switch (PL_regkind[OP(scan)]) {
3191 case WHILEM: /* End of (?:...)* . */
3192 scan = NEXTOPER(scan);
3193 goto finish;
3194 case PLUS:
3195 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3196 next = NEXTOPER(scan);
3197 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3198 mincount = 1;
3199 maxcount = REG_INFTY;
3200 next = regnext(scan);
3201 scan = NEXTOPER(scan);
3202 goto do_curly;
3203 }
3204 }
3205 if (flags & SCF_DO_SUBSTR)
3206 data->pos_min++;
3207 min++;
3208 /* Fall through. */
3209 case STAR:
3210 if (flags & SCF_DO_STCLASS) {
3211 mincount = 0;
3212 maxcount = REG_INFTY;
3213 next = regnext(scan);
3214 scan = NEXTOPER(scan);
3215 goto do_curly;
3216 }
3217 is_inf = is_inf_internal = 1;
3218 scan = regnext(scan);
c277df42 3219 if (flags & SCF_DO_SUBSTR) {
304ee84b 3220 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3221 data->longest = &(data->longest_float);
c277df42 3222 }
8aa23a47
YO
3223 goto optimize_curly_tail;
3224 case CURLY:
3225 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3226 && (scan->flags == stopparen))
3227 {
3228 mincount = 1;
3229 maxcount = 1;
3230 } else {
3231 mincount = ARG1(scan);
3232 maxcount = ARG2(scan);
653099ff 3233 }
8aa23a47
YO
3234 next = regnext(scan);
3235 if (OP(scan) == CURLYX) {
3236 I32 lp = (data ? *(data->last_closep) : 0);
3237 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3238 }
8aa23a47
YO
3239 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3240 next_is_eval = (OP(scan) == EVAL);
3241 do_curly:
3242 if (flags & SCF_DO_SUBSTR) {
304ee84b 3243 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3244 pos_before = data->pos_min;
b45f050a 3245 }
8aa23a47
YO
3246 if (data) {
3247 fl = data->flags;
3248 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3249 if (is_inf)
3250 data->flags |= SF_IS_INF;
3251 }
3252 if (flags & SCF_DO_STCLASS) {
3253 cl_init(pRExC_state, &this_class);
3254 oclass = data->start_class;
3255 data->start_class = &this_class;
3256 f |= SCF_DO_STCLASS_AND;
3257 f &= ~SCF_DO_STCLASS_OR;
3258 }
779bcb7d
NC
3259 /* Exclude from super-linear cache processing any {n,m}
3260 regops for which the combination of input pos and regex
3261 pos is not enough information to determine if a match
3262 will be possible.
3263
3264 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3265 regex pos at the \s*, the prospects for a match depend not
3266 only on the input position but also on how many (bar\s*)
3267 repeats into the {4,8} we are. */
3268 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3269 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3270
8aa23a47
YO
3271 /* This will finish on WHILEM, setting scan, or on NULL: */
3272 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3273 last, data, stopparen, recursed, NULL,
3274 (mincount == 0
3275 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3276
8aa23a47
YO
3277 if (flags & SCF_DO_STCLASS)
3278 data->start_class = oclass;
3279 if (mincount == 0 || minnext == 0) {
3280 if (flags & SCF_DO_STCLASS_OR) {
3281 cl_or(pRExC_state, data->start_class, &this_class);
3282 }
3283 else if (flags & SCF_DO_STCLASS_AND) {
3284 /* Switch to OR mode: cache the old value of
3285 * data->start_class */
3286 INIT_AND_WITHP;
3287 StructCopy(data->start_class, and_withp,
3288 struct regnode_charclass_class);
3289 flags &= ~SCF_DO_STCLASS_AND;
3290 StructCopy(&this_class, data->start_class,
3291 struct regnode_charclass_class);
3292 flags |= SCF_DO_STCLASS_OR;
3293 data->start_class->flags |= ANYOF_EOS;
3294 }
3295 } else { /* Non-zero len */
3296 if (flags & SCF_DO_STCLASS_OR) {
3297 cl_or(pRExC_state, data->start_class, &this_class);
3298 cl_and(data->start_class, and_withp);
3299 }
3300 else if (flags & SCF_DO_STCLASS_AND)
3301 cl_and(data->start_class, &this_class);
3302 flags &= ~SCF_DO_STCLASS;
3303 }
3304 if (!scan) /* It was not CURLYX, but CURLY. */
3305 scan = next;
3306 if ( /* ? quantifier ok, except for (?{ ... }) */
3307 (next_is_eval || !(mincount == 0 && maxcount == 1))
3308 && (minnext == 0) && (deltanext == 0)
3309 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3310 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3311 {
668c081a
NC
3312 ckWARNreg(RExC_parse,
3313 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3314 }
3315
3316 min += minnext * mincount;
3317 is_inf_internal |= ((maxcount == REG_INFTY
3318 && (minnext + deltanext) > 0)
3319 || deltanext == I32_MAX);
3320 is_inf |= is_inf_internal;
3321 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3322
3323 /* Try powerful optimization CURLYX => CURLYN. */
3324 if ( OP(oscan) == CURLYX && data
3325 && data->flags & SF_IN_PAR
3326 && !(data->flags & SF_HAS_EVAL)
3327 && !deltanext && minnext == 1 ) {
3328 /* Try to optimize to CURLYN. */
3329 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3330 regnode * const nxt1 = nxt;
497b47a8 3331#ifdef DEBUGGING
8aa23a47 3332 regnode *nxt2;
497b47a8 3333#endif
c277df42 3334
8aa23a47
YO
3335 /* Skip open. */
3336 nxt = regnext(nxt);
e52fc539 3337 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3338 && !(PL_regkind[OP(nxt)] == EXACT
3339 && STR_LEN(nxt) == 1))
3340 goto nogo;
497b47a8 3341#ifdef DEBUGGING
8aa23a47 3342 nxt2 = nxt;
497b47a8 3343#endif
8aa23a47
YO
3344 nxt = regnext(nxt);
3345 if (OP(nxt) != CLOSE)
3346 goto nogo;
3347 if (RExC_open_parens) {
3348 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3349 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3350 }
3351 /* Now we know that nxt2 is the only contents: */
3352 oscan->flags = (U8)ARG(nxt);
3353 OP(oscan) = CURLYN;
3354 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3355
c277df42 3356#ifdef DEBUGGING
8aa23a47 3357 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3358 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3359 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3360 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3361 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3362 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3363#endif
8aa23a47
YO
3364 }
3365 nogo:
3366
3367 /* Try optimization CURLYX => CURLYM. */
3368 if ( OP(oscan) == CURLYX && data
3369 && !(data->flags & SF_HAS_PAR)
3370 && !(data->flags & SF_HAS_EVAL)
3371 && !deltanext /* atom is fixed width */
3372 && minnext != 0 /* CURLYM can't handle zero width */
3373 ) {
3374 /* XXXX How to optimize if data == 0? */
3375 /* Optimize to a simpler form. */
3376 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3377 regnode *nxt2;
3378
3379 OP(oscan) = CURLYM;
3380 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3381 && (OP(nxt2) != WHILEM))
3382 nxt = nxt2;
3383 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3384 /* Need to optimize away parenths. */
b3c0965f 3385 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3386 /* Set the parenth number. */
3387 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3388
8aa23a47
YO
3389 oscan->flags = (U8)ARG(nxt);
3390 if (RExC_open_parens) {
3391 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3392 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3393 }
8aa23a47
YO
3394 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3395 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3396
c277df42 3397#ifdef DEBUGGING
8aa23a47
YO
3398 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3399 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PAR
3400 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3401 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3402#endif
c277df42 3403#if 0
8aa23a47
YO
3404 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3405 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3406 if (nnxt == nxt) {
3407 if (reg_off_by_arg[OP(nxt1)])
3408 ARG_SET(nxt1, nxt2 - nxt1);
3409 else if (nxt2 - nxt1 < U16_MAX)
3410 NEXT_OFF(nxt1) = nxt2 - nxt1;
3411 else
3412 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3413 }
8aa23a47 3414 nxt1 = nnxt;
c277df42 3415 }
5d1c421c 3416#endif
8aa23a47
YO
3417 /* Optimize again: */
3418 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3419 NULL, stopparen, recursed, NULL, 0,depth+1);
3420 }
3421 else
3422 oscan->flags = 0;
3423 }
3424 else if ((OP(oscan) == CURLYX)
3425 && (flags & SCF_WHILEM_VISITED_POS)
3426 /* See the comment on a similar expression above.
3b753521 3427 However, this time it's not a subexpression
8aa23a47
YO
3428 we care about, but the expression itself. */
3429 && (maxcount == REG_INFTY)
3430 && data && ++data->whilem_c < 16) {
3431 /* This stays as CURLYX, we can put the count/of pair. */
3432 /* Find WHILEM (as in regexec.c) */
3433 regnode *nxt = oscan + NEXT_OFF(oscan);
3434
3435 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3436 nxt += ARG(nxt);
3437 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3438 | (RExC_whilem_seen << 4)); /* On WHILEM */
3439 }
3440 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3441 pars++;
3442 if (flags & SCF_DO_SUBSTR) {
3443 SV *last_str = NULL;
3444 int counted = mincount != 0;
a0ed51b3 3445
8aa23a47
YO
3446 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3447#if defined(SPARC64_GCC_WORKAROUND)
3448 I32 b = 0;
3449 STRLEN l = 0;
3450 const char *s = NULL;
3451 I32 old = 0;
b515a41d 3452
8aa23a47
YO
3453 if (pos_before >= data->last_start_min)
3454 b = pos_before;
3455 else
3456 b = data->last_start_min;
b515a41d 3457
8aa23a47
YO
3458 l = 0;
3459 s = SvPV_const(data->last_found, l);
3460 old = b - data->last_start_min;
3461
3462#else
3463 I32 b = pos_before >= data->last_start_min
3464 ? pos_before : data->last_start_min;
3465 STRLEN l;
3466 const char * const s = SvPV_const(data->last_found, l);
3467 I32 old = b - data->last_start_min;
3468#endif
3469
3470 if (UTF)
3471 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3472 l -= old;
3473 /* Get the added string: */
740cce10 3474 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3475 if (deltanext == 0 && pos_before == b) {
3476 /* What was added is a constant string */
3477 if (mincount > 1) {
3478 SvGROW(last_str, (mincount * l) + 1);
3479 repeatcpy(SvPVX(last_str) + l,
3480 SvPVX_const(last_str), l, mincount - 1);
3481 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3482 /* Add additional parts. */
3483 SvCUR_set(data->last_found,
3484 SvCUR(data->last_found) - l);
3485 sv_catsv(data->last_found, last_str);
3486 {
3487 SV * sv = data->last_found;
3488 MAGIC *mg =
3489 SvUTF8(sv) && SvMAGICAL(sv) ?
3490 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3491 if (mg && mg->mg_len >= 0)
bd94e887 3492 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3493 }
8aa23a47 3494 data->last_end += l * (mincount - 1);
b515a41d 3495 }
8aa23a47
YO
3496 } else {
3497 /* start offset must point into the last copy */
3498 data->last_start_min += minnext * (mincount - 1);
3499 data->last_start_max += is_inf ? I32_MAX
3500 : (maxcount - 1) * (minnext + data->pos_delta);
3501 }
c277df42 3502 }
8aa23a47
YO
3503 /* It is counted once already... */
3504 data->pos_min += minnext * (mincount - counted);
3505 data->pos_delta += - counted * deltanext +
3506 (minnext + deltanext) * maxcount - minnext * mincount;
3507 if (mincount != maxcount) {
3508 /* Cannot extend fixed substrings found inside
3509 the group. */
304ee84b 3510 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3511 if (mincount && last_str) {
3512 SV * const sv = data->last_found;
3513 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3514 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3515
3516 if (mg)
3517 mg->mg_len = -1;
3518 sv_setsv(sv, last_str);
3519 data->last_end = data->pos_min;
3520 data->last_start_min =
3521 data->pos_min - CHR_SVLEN(last_str);
3522 data->last_start_max = is_inf
3523 ? I32_MAX
3524 : data->pos_min + data->pos_delta
3525 - CHR_SVLEN(last_str);
3526 }
3527 data->longest = &(data->longest_float);
3528 }
3529 SvREFCNT_dec(last_str);
c277df42 3530 }
8aa23a47
YO
3531 if (data && (fl & SF_HAS_EVAL))
3532 data->flags |= SF_HAS_EVAL;
3533 optimize_curly_tail:
3534 if (OP(oscan) != CURLYX) {
3535 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3536 && NEXT_OFF(next))
3537 NEXT_OFF(oscan) += NEXT_OFF(next);
3538 }
3539 continue;
f56b6394 3540 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 3541 if (flags & SCF_DO_SUBSTR) {
304ee84b 3542 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3543 data->longest = &(data->longest_float);
3544 }
3545 is_inf = is_inf_internal = 1;
3546 if (flags & SCF_DO_STCLASS_OR)
3547 cl_anything(pRExC_state, data->start_class);
3548 flags &= ~SCF_DO_STCLASS;
3549 break;
c277df42 3550 }
8aa23a47 3551 }
e1d1eefb
YO
3552 else if (OP(scan) == LNBREAK) {
3553 if (flags & SCF_DO_STCLASS) {
3554 int value = 0;
3555 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3556 if (flags & SCF_DO_STCLASS_AND) {
3557 for (value = 0; value < 256; value++)
e64b1bd1 3558 if (!is_VERTWS_cp(value))
b9a59e08
KW
3559 ANYOF_BITMAP_CLEAR(data->start_class, value);
3560 }
3561 else {
e1d1eefb 3562 for (value = 0; value < 256; value++)
e64b1bd1 3563 if (is_VERTWS_cp(value))
b9a59e08
KW
3564 ANYOF_BITMAP_SET(data->start_class, value);
3565 }
e1d1eefb
YO
3566 if (flags & SCF_DO_STCLASS_OR)
3567 cl_and(data->start_class, and_withp);
3568 flags &= ~SCF_DO_STCLASS;
3569 }
3570 min += 1;
f9a79580 3571 delta += 1;
e1d1eefb
YO
3572 if (flags & SCF_DO_SUBSTR) {
3573 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3574 data->pos_min += 1;
f9a79580 3575 data->pos_delta += 1;
e1d1eefb
YO
3576 data->longest = &(data->longest_float);
3577 }
e1d1eefb 3578 }
f9a79580 3579 else if (OP(scan) == FOLDCHAR) {
ced7f090 3580 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
f9a79580
RGS
3581 flags &= ~SCF_DO_STCLASS;
3582 min += 1;
3583 delta += d;
3584 if (flags & SCF_DO_SUBSTR) {
3585 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3586 data->pos_min += 1;
3587 data->pos_delta += d;
3588 data->longest = &(data->longest_float);
3589 }
3590 }
e52fc539 3591 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 3592 int value = 0;
653099ff 3593
8aa23a47 3594 if (flags & SCF_DO_SUBSTR) {
304ee84b 3595 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3596 data->pos_min++;
3597 }
3598 min++;
3599 if (flags & SCF_DO_STCLASS) {
3600 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3601
8aa23a47
YO
3602 /* Some of the logic below assumes that switching
3603 locale on will only add false positives. */
3604 switch (PL_regkind[OP(scan)]) {
3605 case SANY:
3606 default:
3607 do_default:
3608 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3609 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3610 cl_anything(pRExC_state, data->start_class);
3611 break;
3612 case REG_ANY:
3613 if (OP(scan) == SANY)
3614 goto do_default;
3615 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3616 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 3617 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
8aa23a47 3618 cl_anything(pRExC_state, data->start_class);
653099ff 3619 }
8aa23a47
YO
3620 if (flags & SCF_DO_STCLASS_AND || !value)
3621 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3622 break;
3623 case ANYOF:
3624 if (flags & SCF_DO_STCLASS_AND)
3625 cl_and(data->start_class,
3626 (struct regnode_charclass_class*)scan);
653099ff 3627 else
8aa23a47
YO
3628 cl_or(pRExC_state, data->start_class,
3629 (struct regnode_charclass_class*)scan);
3630 break;
3631 case ALNUM:
3632 if (flags & SCF_DO_STCLASS_AND) {
3633 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3634 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
b988e673 3635 if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
a12cf05f
KW
3636 for (value = 0; value < 256; value++) {
3637 if (!isWORDCHAR_L1(value)) {
3638 ANYOF_BITMAP_CLEAR(data->start_class, value);
3639 }
3640 }
3641 } else {
3642 for (value = 0; value < 256; value++) {
3643 if (!isALNUM(value)) {
3644 ANYOF_BITMAP_CLEAR(data->start_class, value);
3645 }
3646 }
3647 }
8aa23a47 3648 }
653099ff 3649 }
8aa23a47
YO
3650 else {
3651 if (data->start_class->flags & ANYOF_LOCALE)
3652 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
b988e673 3653 else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
a12cf05f
KW
3654 for (value = 0; value < 256; value++) {
3655 if (isWORDCHAR_L1(value)) {
3656 ANYOF_BITMAP_SET(data->start_class, value);
3657 }
3658 }
3659 } else {
3660 for (value = 0; value < 256; value++) {
3661 if (isALNUM(value)) {
3662 ANYOF_BITMAP_SET(data->start_class, value);
3663 }
3664 }
3665 }
8aa23a47
YO
3666 }
3667 break;
8aa23a47
YO
3668 case NALNUM:
3669 if (flags & SCF_DO_STCLASS_AND) {
3670 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3671 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
b988e673 3672 if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
a12cf05f
KW
3673 for (value = 0; value < 256; value++) {
3674 if (isWORDCHAR_L1(value)) {
3675 ANYOF_BITMAP_CLEAR(data->start_class, value);
3676 }
3677 }
3678 } else {
3679 for (value = 0; value < 256; value++) {
3680 if (isALNUM(value)) {
3681 ANYOF_BITMAP_CLEAR(data->start_class, value);
3682 }
3683 }
3684 }
653099ff
GS
3685 }
3686 }
8aa23a47
YO
3687 else {
3688 if (data->start_class->flags & ANYOF_LOCALE)
3689