This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Correct the paramter to Perl_op_xmldump(). The one that got away from
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
288b8c02 105 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
106 regexp *rx; /* perl core regexp structure */
107 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 108 char *start; /* Start of input for compile */
830247a4
IZ
109 char *end; /* End of input for compile */
110 char *parse; /* Input-scan pointer. */
111 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 112 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 113 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 114 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
115 I32 naughty; /* How bad is this pattern? */
116 I32 sawback; /* Did we see \1, ...? */
117 U32 seen;
118 I32 size; /* Code size. */
c74340f9
YO
119 I32 npar; /* Capture buffer count, (OPEN). */
120 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 121 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
122 I32 extralen;
123 I32 seen_zerolen;
124 I32 seen_evals;
40d049e4
YO
125 regnode **open_parens; /* pointers to open parens */
126 regnode **close_parens; /* pointers to close parens */
127 regnode *opend; /* END node in program */
02daf0ab
YO
128 I32 utf8; /* whether the pattern is utf8 or not */
129 I32 orig_utf8; /* whether the pattern was originally in utf8 */
130 /* XXX use this for future optimisation of case
131 * where pattern must be upgraded to utf8. */
6bda09f9 132 HV *charnames; /* cache of named sequences */
81714fb9 133 HV *paren_names; /* Paren names */
1f1031fe 134
40d049e4
YO
135 regnode **recurse; /* Recurse regops */
136 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
137#if ADD_TO_REGEXEC
138 char *starttry; /* -Dr: where regtry was called. */
139#define RExC_starttry (pRExC_state->starttry)
140#endif
3dab1dad 141#ifdef DEBUGGING
be8e71aa 142 const char *lastparse;
3dab1dad 143 I32 lastnum;
1f1031fe 144 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
145#define RExC_lastparse (pRExC_state->lastparse)
146#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 147#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 148#endif
830247a4
IZ
149} RExC_state_t;
150
e2509266 151#define RExC_flags (pRExC_state->flags)
830247a4 152#define RExC_precomp (pRExC_state->precomp)
288b8c02 153#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 154#define RExC_rx (pRExC_state->rx)
f8fc2ecf 155#define RExC_rxi (pRExC_state->rxi)
fac92740 156#define RExC_start (pRExC_state->start)
830247a4
IZ
157#define RExC_end (pRExC_state->end)
158#define RExC_parse (pRExC_state->parse)
159#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
160#ifdef RE_TRACK_PATTERN_OFFSETS
161#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
162#endif
830247a4 163#define RExC_emit (pRExC_state->emit)
fac92740 164#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 165#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
166#define RExC_naughty (pRExC_state->naughty)
167#define RExC_sawback (pRExC_state->sawback)
168#define RExC_seen (pRExC_state->seen)
169#define RExC_size (pRExC_state->size)
170#define RExC_npar (pRExC_state->npar)
e2e6a0f1 171#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
172#define RExC_extralen (pRExC_state->extralen)
173#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
174#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 175#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 176#define RExC_orig_utf8 (pRExC_state->orig_utf8)
fc8cd66c 177#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
178#define RExC_open_parens (pRExC_state->open_parens)
179#define RExC_close_parens (pRExC_state->close_parens)
180#define RExC_opend (pRExC_state->opend)
81714fb9 181#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
182#define RExC_recurse (pRExC_state->recurse)
183#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 184
cde0cee5 185
a687059c
LW
186#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
187#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
188 ((*s) == '{' && regcurly(s)))
a687059c 189
35c8bce7
LW
190#ifdef SPSTART
191#undef SPSTART /* dratted cpp namespace... */
192#endif
a687059c
LW
193/*
194 * Flags to be passed up and down.
195 */
a687059c 196#define WORST 0 /* Worst case. */
a3b492c3
YO
197#define HASWIDTH 0x01 /* Known to match non-null strings. */
198#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
199#define SPSTART 0x04 /* Starts with * or +. */
200#define TRYAGAIN 0x08 /* Weeded out a declaration. */
201#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 202
3dab1dad
YO
203#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
204
07be1b83
YO
205/* whether trie related optimizations are enabled */
206#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
207#define TRIE_STUDY_OPT
786e8c11 208#define FULL_TRIE_STUDY
07be1b83
YO
209#define TRIE_STCLASS
210#endif
1de06328
YO
211
212
40d049e4
YO
213
214#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
215#define PBITVAL(paren) (1 << ((paren) & 7))
216#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
217#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
218#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
219
220
1de06328
YO
221/* About scan_data_t.
222
223 During optimisation we recurse through the regexp program performing
224 various inplace (keyhole style) optimisations. In addition study_chunk
225 and scan_commit populate this data structure with information about
226 what strings MUST appear in the pattern. We look for the longest
227 string that must appear for at a fixed location, and we look for the
228 longest string that may appear at a floating location. So for instance
229 in the pattern:
230
231 /FOO[xX]A.*B[xX]BAR/
232
233 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
234 strings (because they follow a .* construct). study_chunk will identify
235 both FOO and BAR as being the longest fixed and floating strings respectively.
236
237 The strings can be composites, for instance
238
239 /(f)(o)(o)/
240
241 will result in a composite fixed substring 'foo'.
242
243 For each string some basic information is maintained:
244
245 - offset or min_offset
246 This is the position the string must appear at, or not before.
247 It also implicitly (when combined with minlenp) tells us how many
248 character must match before the string we are searching.
249 Likewise when combined with minlenp and the length of the string
250 tells us how many characters must appear after the string we have
251 found.
252
253 - max_offset
254 Only used for floating strings. This is the rightmost point that
255 the string can appear at. Ifset to I32 max it indicates that the
256 string can occur infinitely far to the right.
257
258 - minlenp
259 A pointer to the minimum length of the pattern that the string
260 was found inside. This is important as in the case of positive
261 lookahead or positive lookbehind we can have multiple patterns
262 involved. Consider
263
264 /(?=FOO).*F/
265
266 The minimum length of the pattern overall is 3, the minimum length
267 of the lookahead part is 3, but the minimum length of the part that
268 will actually match is 1. So 'FOO's minimum length is 3, but the
269 minimum length for the F is 1. This is important as the minimum length
270 is used to determine offsets in front of and behind the string being
271 looked for. Since strings can be composites this is the length of the
272 pattern at the time it was commited with a scan_commit. Note that
273 the length is calculated by study_chunk, so that the minimum lengths
274 are not known until the full pattern has been compiled, thus the
275 pointer to the value.
276
277 - lookbehind
278
279 In the case of lookbehind the string being searched for can be
280 offset past the start point of the final matching string.
281 If this value was just blithely removed from the min_offset it would
282 invalidate some of the calculations for how many chars must match
283 before or after (as they are derived from min_offset and minlen and
284 the length of the string being searched for).
285 When the final pattern is compiled and the data is moved from the
286 scan_data_t structure into the regexp structure the information
287 about lookbehind is factored in, with the information that would
288 have been lost precalculated in the end_shift field for the
289 associated string.
290
291 The fields pos_min and pos_delta are used to store the minimum offset
292 and the delta to the maximum offset at the current point in the pattern.
293
294*/
2c2d71f5
JH
295
296typedef struct scan_data_t {
1de06328
YO
297 /*I32 len_min; unused */
298 /*I32 len_delta; unused */
2c2d71f5
JH
299 I32 pos_min;
300 I32 pos_delta;
301 SV *last_found;
1de06328 302 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
303 I32 last_start_min;
304 I32 last_start_max;
1de06328
YO
305 SV **longest; /* Either &l_fixed, or &l_float. */
306 SV *longest_fixed; /* longest fixed string found in pattern */
307 I32 offset_fixed; /* offset where it starts */
308 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
309 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
310 SV *longest_float; /* longest floating string found in pattern */
311 I32 offset_float_min; /* earliest point in string it can appear */
312 I32 offset_float_max; /* latest point in string it can appear */
313 I32 *minlen_float; /* pointer to the minlen relevent to the string */
314 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
315 I32 flags;
316 I32 whilem_c;
cb434fcc 317 I32 *last_closep;
653099ff 318 struct regnode_charclass_class *start_class;
2c2d71f5
JH
319} scan_data_t;
320
a687059c 321/*
e50aee73 322 * Forward declarations for pregcomp()'s friends.
a687059c 323 */
a0d0e21e 324
27da23d5 325static const scan_data_t zero_scan_data =
1de06328 326 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
327
328#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
329#define SF_BEFORE_SEOL 0x0001
330#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
331#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
332#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
333
09b7f37c
CB
334#ifdef NO_UNARY_PLUS
335# define SF_FIX_SHIFT_EOL (0+2)
336# define SF_FL_SHIFT_EOL (0+4)
337#else
338# define SF_FIX_SHIFT_EOL (+2)
339# define SF_FL_SHIFT_EOL (+4)
340#endif
c277df42
IZ
341
342#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
343#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
344
345#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
346#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
347#define SF_IS_INF 0x0040
348#define SF_HAS_PAR 0x0080
349#define SF_IN_PAR 0x0100
350#define SF_HAS_EVAL 0x0200
351#define SCF_DO_SUBSTR 0x0400
653099ff
GS
352#define SCF_DO_STCLASS_AND 0x0800
353#define SCF_DO_STCLASS_OR 0x1000
354#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 355#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 356
786e8c11 357#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 358#define SCF_SEEN_ACCEPT 0x8000
07be1b83 359
eb160463 360#define UTF (RExC_utf8 != 0)
bbe252da
YO
361#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
362#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 363
ffc61ed2 364#define OOB_UNICODE 12345678
93733859 365#define OOB_NAMEDCLASS -1
b8c5462f 366
a0ed51b3
LW
367#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
368#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
369
8615cb43 370
b45f050a
JF
371/* length of regex to show in messages that don't mark a position within */
372#define RegexLengthToShowInErrorMessages 127
373
374/*
375 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
376 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
377 * op/pragma/warn/regcomp.
378 */
7253e4e3
RK
379#define MARKER1 "<-- HERE" /* marker as it appears in the description */
380#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 381
7253e4e3 382#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
383
384/*
385 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
386 * arg. Show regex, up to a maximum length. If it's too long, chop and add
387 * "...".
388 */
58e23c8d 389#define _FAIL(code) STMT_START { \
bfed75c6 390 const char *ellipses = ""; \
ccb2c380
MP
391 IV len = RExC_end - RExC_precomp; \
392 \
393 if (!SIZE_ONLY) \
288b8c02 394 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
395 if (len > RegexLengthToShowInErrorMessages) { \
396 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
397 len = RegexLengthToShowInErrorMessages - 10; \
398 ellipses = "..."; \
399 } \
58e23c8d 400 code; \
ccb2c380 401} STMT_END
8615cb43 402
58e23c8d
YO
403#define FAIL(msg) _FAIL( \
404 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
405 msg, (int)len, RExC_precomp, ellipses))
406
407#define FAIL2(msg,arg) _FAIL( \
408 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
409 arg, (int)len, RExC_precomp, ellipses))
410
b45f050a 411/*
b45f050a
JF
412 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
413 */
ccb2c380 414#define Simple_vFAIL(m) STMT_START { \
a28509cc 415 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
416 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
417 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
418} STMT_END
b45f050a
JF
419
420/*
421 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
422 */
ccb2c380
MP
423#define vFAIL(m) STMT_START { \
424 if (!SIZE_ONLY) \
288b8c02 425 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
426 Simple_vFAIL(m); \
427} STMT_END
b45f050a
JF
428
429/*
430 * Like Simple_vFAIL(), but accepts two arguments.
431 */
ccb2c380 432#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 433 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
434 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
435 (int)offset, RExC_precomp, RExC_precomp + offset); \
436} STMT_END
b45f050a
JF
437
438/*
439 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
440 */
ccb2c380
MP
441#define vFAIL2(m,a1) STMT_START { \
442 if (!SIZE_ONLY) \
288b8c02 443 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
444 Simple_vFAIL2(m, a1); \
445} STMT_END
b45f050a
JF
446
447
448/*
449 * Like Simple_vFAIL(), but accepts three arguments.
450 */
ccb2c380 451#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 452 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
453 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
454 (int)offset, RExC_precomp, RExC_precomp + offset); \
455} STMT_END
b45f050a
JF
456
457/*
458 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
459 */
ccb2c380
MP
460#define vFAIL3(m,a1,a2) STMT_START { \
461 if (!SIZE_ONLY) \
288b8c02 462 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
463 Simple_vFAIL3(m, a1, a2); \
464} STMT_END
b45f050a
JF
465
466/*
467 * Like Simple_vFAIL(), but accepts four arguments.
468 */
ccb2c380 469#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 470 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
471 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
472 (int)offset, RExC_precomp, RExC_precomp + offset); \
473} STMT_END
b45f050a 474
ccb2c380 475#define vWARN(loc,m) STMT_START { \
a28509cc 476 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
478 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
480
481#define vWARNdep(loc,m) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
484 "%s" REPORT_LOCATION, \
485 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
486} STMT_END
487
488
489#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 490 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
491 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
492 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
493} STMT_END
494
495#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 496 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
500
501#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 502 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
503 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
505} STMT_END
506
507#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 508 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
510 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
511} STMT_END
9d1d55b5 512
8615cb43 513
cd439c50 514/* Allow for side effects in s */
ccb2c380
MP
515#define REGC(c,s) STMT_START { \
516 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
517} STMT_END
cd439c50 518
fac92740
MJD
519/* Macros for recording node offsets. 20001227 mjd@plover.com
520 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
521 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
522 * Element 0 holds the number n.
07be1b83 523 * Position is 1 indexed.
fac92740 524 */
7122b237
YO
525#ifndef RE_TRACK_PATTERN_OFFSETS
526#define Set_Node_Offset_To_R(node,byte)
527#define Set_Node_Offset(node,byte)
528#define Set_Cur_Node_Offset
529#define Set_Node_Length_To_R(node,len)
530#define Set_Node_Length(node,len)
531#define Set_Node_Cur_Length(node)
532#define Node_Offset(n)
533#define Node_Length(n)
534#define Set_Node_Offset_Length(node,offset,len)
535#define ProgLen(ri) ri->u.proglen
536#define SetProgLen(ri,x) ri->u.proglen = x
537#else
538#define ProgLen(ri) ri->u.offsets[0]
539#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
540#define Set_Node_Offset_To_R(node,byte) STMT_START { \
541 if (! SIZE_ONLY) { \
542 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 543 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 544 if((node) < 0) { \
551405c4 545 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
546 } else { \
547 RExC_offsets[2*(node)-1] = (byte); \
548 } \
549 } \
550} STMT_END
551
552#define Set_Node_Offset(node,byte) \
553 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
554#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
555
556#define Set_Node_Length_To_R(node,len) STMT_START { \
557 if (! SIZE_ONLY) { \
558 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 559 __LINE__, (int)(node), (int)(len))); \
ccb2c380 560 if((node) < 0) { \
551405c4 561 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
562 } else { \
563 RExC_offsets[2*(node)] = (len); \
564 } \
565 } \
566} STMT_END
567
568#define Set_Node_Length(node,len) \
569 Set_Node_Length_To_R((node)-RExC_emit_start, len)
570#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
571#define Set_Node_Cur_Length(node) \
572 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
573
574/* Get offsets and lengths */
575#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
576#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
577
07be1b83
YO
578#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
579 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
580 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
581} STMT_END
7122b237 582#endif
07be1b83
YO
583
584#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
585#define EXPERIMENTAL_INPLACESCAN
7122b237 586#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 587
304ee84b
YO
588#define DEBUG_STUDYDATA(str,data,depth) \
589DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 590 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
591 "%*s" str "Pos:%"IVdf"/%"IVdf \
592 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
593 (int)(depth)*2, "", \
594 (IV)((data)->pos_min), \
595 (IV)((data)->pos_delta), \
304ee84b 596 (UV)((data)->flags), \
1de06328 597 (IV)((data)->whilem_c), \
304ee84b
YO
598 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
599 is_inf ? "INF " : "" \
1de06328
YO
600 ); \
601 if ((data)->last_found) \
602 PerlIO_printf(Perl_debug_log, \
603 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
604 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
605 SvPVX_const((data)->last_found), \
606 (IV)((data)->last_end), \
607 (IV)((data)->last_start_min), \
608 (IV)((data)->last_start_max), \
609 ((data)->longest && \
610 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
611 SvPVX_const((data)->longest_fixed), \
612 (IV)((data)->offset_fixed), \
613 ((data)->longest && \
614 (data)->longest==&((data)->longest_float)) ? "*" : "", \
615 SvPVX_const((data)->longest_float), \
616 (IV)((data)->offset_float_min), \
617 (IV)((data)->offset_float_max) \
618 ); \
619 PerlIO_printf(Perl_debug_log,"\n"); \
620});
621
acfe0abc 622static void clear_re(pTHX_ void *r);
4327152a 623
653099ff 624/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 625 Update the longest found anchored substring and the longest found
653099ff
GS
626 floating substrings if needed. */
627
4327152a 628STATIC void
304ee84b 629S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 630{
e1ec3a88
AL
631 const STRLEN l = CHR_SVLEN(data->last_found);
632 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 633 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 634
c277df42 635 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 636 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
637 if (*data->longest == data->longest_fixed) {
638 data->offset_fixed = l ? data->last_start_min : data->pos_min;
639 if (data->flags & SF_BEFORE_EOL)
b81d288d 640 data->flags
c277df42
IZ
641 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
642 else
643 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
644 data->minlen_fixed=minlenp;
645 data->lookbehind_fixed=0;
a0ed51b3 646 }
304ee84b 647 else { /* *data->longest == data->longest_float */
c277df42 648 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
649 data->offset_float_max = (l
650 ? data->last_start_max
c277df42 651 : data->pos_min + data->pos_delta);
304ee84b 652 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 653 data->offset_float_max = I32_MAX;
c277df42 654 if (data->flags & SF_BEFORE_EOL)
b81d288d 655 data->flags
c277df42
IZ
656 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
657 else
658 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
659 data->minlen_float=minlenp;
660 data->lookbehind_float=0;
c277df42
IZ
661 }
662 }
663 SvCUR_set(data->last_found, 0);
0eda9292 664 {
a28509cc 665 SV * const sv = data->last_found;
097eb12c
AL
666 if (SvUTF8(sv) && SvMAGICAL(sv)) {
667 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
668 if (mg)
669 mg->mg_len = 0;
670 }
0eda9292 671 }
c277df42
IZ
672 data->last_end = -1;
673 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 674 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
675}
676
653099ff
GS
677/* Can match anything (initialization) */
678STATIC void
097eb12c 679S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 680{
653099ff 681 ANYOF_CLASS_ZERO(cl);
f8bef550 682 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 683 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
684 if (LOC)
685 cl->flags |= ANYOF_LOCALE;
686}
687
688/* Can match anything (initialization) */
689STATIC int
5f66b61c 690S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
691{
692 int value;
693
aaa51d5e 694 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
695 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
696 return 1;
1aa99e6b
IH
697 if (!(cl->flags & ANYOF_UNICODE_ALL))
698 return 0;
10edeb5d 699 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 700 return 0;
653099ff
GS
701 return 1;
702}
703
704/* Can match anything (initialization) */
705STATIC void
097eb12c 706S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 707{
8ecf7187 708 Zero(cl, 1, struct regnode_charclass_class);
653099ff 709 cl->type = ANYOF;
830247a4 710 cl_anything(pRExC_state, cl);
653099ff
GS
711}
712
713STATIC void
097eb12c 714S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 715{
8ecf7187 716 Zero(cl, 1, struct regnode_charclass_class);
653099ff 717 cl->type = ANYOF;
830247a4 718 cl_anything(pRExC_state, cl);
653099ff
GS
719 if (LOC)
720 cl->flags |= ANYOF_LOCALE;
721}
722
723/* 'And' a given class with another one. Can create false positives */
724/* We assume that cl is not inverted */
725STATIC void
5f66b61c 726S_cl_and(struct regnode_charclass_class *cl,
a28509cc 727 const struct regnode_charclass_class *and_with)
653099ff 728{
40d049e4
YO
729
730 assert(and_with->type == ANYOF);
653099ff
GS
731 if (!(and_with->flags & ANYOF_CLASS)
732 && !(cl->flags & ANYOF_CLASS)
733 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
734 && !(and_with->flags & ANYOF_FOLD)
735 && !(cl->flags & ANYOF_FOLD)) {
736 int i;
737
738 if (and_with->flags & ANYOF_INVERT)
739 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
740 cl->bitmap[i] &= ~and_with->bitmap[i];
741 else
742 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
743 cl->bitmap[i] &= and_with->bitmap[i];
744 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
745 if (!(and_with->flags & ANYOF_EOS))
746 cl->flags &= ~ANYOF_EOS;
1aa99e6b 747
14ebb1a2
JH
748 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
749 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
750 cl->flags &= ~ANYOF_UNICODE_ALL;
751 cl->flags |= ANYOF_UNICODE;
752 ARG_SET(cl, ARG(and_with));
753 }
14ebb1a2
JH
754 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
755 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 756 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
757 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
758 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 759 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
760}
761
762/* 'OR' a given class with another one. Can create false positives */
763/* We assume that cl is not inverted */
764STATIC void
097eb12c 765S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 766{
653099ff
GS
767 if (or_with->flags & ANYOF_INVERT) {
768 /* We do not use
769 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
770 * <= (B1 | !B2) | (CL1 | !CL2)
771 * which is wasteful if CL2 is small, but we ignore CL2:
772 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
773 * XXXX Can we handle case-fold? Unclear:
774 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
775 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
776 */
777 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
778 && !(or_with->flags & ANYOF_FOLD)
779 && !(cl->flags & ANYOF_FOLD) ) {
780 int i;
781
782 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
783 cl->bitmap[i] |= ~or_with->bitmap[i];
784 } /* XXXX: logic is complicated otherwise */
785 else {
830247a4 786 cl_anything(pRExC_state, cl);
653099ff
GS
787 }
788 } else {
789 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
790 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 791 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
792 || (cl->flags & ANYOF_FOLD)) ) {
793 int i;
794
795 /* OR char bitmap and class bitmap separately */
796 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
797 cl->bitmap[i] |= or_with->bitmap[i];
798 if (or_with->flags & ANYOF_CLASS) {
799 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
800 cl->classflags[i] |= or_with->classflags[i];
801 cl->flags |= ANYOF_CLASS;
802 }
803 }
804 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 805 cl_anything(pRExC_state, cl);
653099ff
GS
806 }
807 }
808 if (or_with->flags & ANYOF_EOS)
809 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
810
811 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
812 ARG(cl) != ARG(or_with)) {
813 cl->flags |= ANYOF_UNICODE_ALL;
814 cl->flags &= ~ANYOF_UNICODE;
815 }
816 if (or_with->flags & ANYOF_UNICODE_ALL) {
817 cl->flags |= ANYOF_UNICODE_ALL;
818 cl->flags &= ~ANYOF_UNICODE;
819 }
653099ff
GS
820}
821
a3621e74
YO
822#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
823#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
824#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
825#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
826
3dab1dad
YO
827
828#ifdef DEBUGGING
07be1b83 829/*
2b8b4781
NC
830 dump_trie(trie,widecharmap,revcharmap)
831 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
832 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
833
834 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
835 The _interim_ variants are used for debugging the interim
836 tables that are used to generate the final compressed
837 representation which is what dump_trie expects.
838
3dab1dad
YO
839 Part of the reason for their existance is to provide a form
840 of documentation as to how the different representations function.
07be1b83
YO
841
842*/
3dab1dad
YO
843
844/*
3dab1dad
YO
845 Dumps the final compressed table form of the trie to Perl_debug_log.
846 Used for debugging make_trie().
847*/
848
849STATIC void
2b8b4781
NC
850S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
851 AV *revcharmap, U32 depth)
3dab1dad
YO
852{
853 U32 state;
ab3bbdeb 854 SV *sv=sv_newmortal();
55eed653 855 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
856 GET_RE_DEBUG_FLAGS_DECL;
857
ab3bbdeb 858
3dab1dad
YO
859 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
860 (int)depth * 2 + 2,"",
861 "Match","Base","Ofs" );
862
863 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 864 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 865 if ( tmp ) {
ab3bbdeb
YO
866 PerlIO_printf( Perl_debug_log, "%*s",
867 colwidth,
ddc5bc0f 868 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
869 PL_colors[0], PL_colors[1],
870 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
871 PERL_PV_ESCAPE_FIRSTCHAR
872 )
873 );
3dab1dad
YO
874 }
875 }
876 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
877 (int)depth * 2 + 2,"");
878
879 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 880 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
881 PerlIO_printf( Perl_debug_log, "\n");
882
1e2e3d02 883 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 884 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
885
886 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
887
888 if ( trie->states[ state ].wordnum ) {
889 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
890 } else {
891 PerlIO_printf( Perl_debug_log, "%6s", "" );
892 }
893
894 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
895
896 if ( base ) {
897 U32 ofs = 0;
898
899 while( ( base + ofs < trie->uniquecharcount ) ||
900 ( base + ofs - trie->uniquecharcount < trie->lasttrans
901 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
902 ofs++;
903
904 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
905
906 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
907 if ( ( base + ofs >= trie->uniquecharcount ) &&
908 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
909 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
910 {
ab3bbdeb
YO
911 PerlIO_printf( Perl_debug_log, "%*"UVXf,
912 colwidth,
3dab1dad
YO
913 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
914 } else {
ab3bbdeb 915 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
916 }
917 }
918
919 PerlIO_printf( Perl_debug_log, "]");
920
921 }
922 PerlIO_printf( Perl_debug_log, "\n" );
923 }
924}
925/*
3dab1dad
YO
926 Dumps a fully constructed but uncompressed trie in list form.
927 List tries normally only are used for construction when the number of
928 possible chars (trie->uniquecharcount) is very high.
929 Used for debugging make_trie().
930*/
931STATIC void
55eed653 932S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
933 HV *widecharmap, AV *revcharmap, U32 next_alloc,
934 U32 depth)
3dab1dad
YO
935{
936 U32 state;
ab3bbdeb 937 SV *sv=sv_newmortal();
55eed653 938 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
939 GET_RE_DEBUG_FLAGS_DECL;
940 /* print out the table precompression. */
ab3bbdeb
YO
941 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
942 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
943 "------:-----+-----------------\n" );
3dab1dad
YO
944
945 for( state=1 ; state < next_alloc ; state ++ ) {
946 U16 charid;
947
ab3bbdeb 948 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
949 (int)depth * 2 + 2,"", (UV)state );
950 if ( ! trie->states[ state ].wordnum ) {
951 PerlIO_printf( Perl_debug_log, "%5s| ","");
952 } else {
953 PerlIO_printf( Perl_debug_log, "W%4x| ",
954 trie->states[ state ].wordnum
955 );
956 }
957 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 958 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
959 if ( tmp ) {
960 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
961 colwidth,
ddc5bc0f 962 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
963 PL_colors[0], PL_colors[1],
964 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
965 PERL_PV_ESCAPE_FIRSTCHAR
966 ) ,
1e2e3d02
YO
967 TRIE_LIST_ITEM(state,charid).forid,
968 (UV)TRIE_LIST_ITEM(state,charid).newstate
969 );
970 if (!(charid % 10))
664e119d
RGS
971 PerlIO_printf(Perl_debug_log, "\n%*s| ",
972 (int)((depth * 2) + 14), "");
1e2e3d02 973 }
ab3bbdeb
YO
974 }
975 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
976 }
977}
978
979/*
3dab1dad
YO
980 Dumps a fully constructed but uncompressed trie in table form.
981 This is the normal DFA style state transition table, with a few
982 twists to facilitate compression later.
983 Used for debugging make_trie().
984*/
985STATIC void
55eed653 986S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
987 HV *widecharmap, AV *revcharmap, U32 next_alloc,
988 U32 depth)
3dab1dad
YO
989{
990 U32 state;
991 U16 charid;
ab3bbdeb 992 SV *sv=sv_newmortal();
55eed653 993 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
994 GET_RE_DEBUG_FLAGS_DECL;
995
996 /*
997 print out the table precompression so that we can do a visual check
998 that they are identical.
999 */
1000
1001 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1002
1003 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1004 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1005 if ( tmp ) {
ab3bbdeb
YO
1006 PerlIO_printf( Perl_debug_log, "%*s",
1007 colwidth,
ddc5bc0f 1008 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1009 PL_colors[0], PL_colors[1],
1010 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1011 PERL_PV_ESCAPE_FIRSTCHAR
1012 )
1013 );
3dab1dad
YO
1014 }
1015 }
1016
1017 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1018
1019 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1020 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1021 }
1022
1023 PerlIO_printf( Perl_debug_log, "\n" );
1024
1025 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1026
1027 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1028 (int)depth * 2 + 2,"",
1029 (UV)TRIE_NODENUM( state ) );
1030
1031 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1032 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1033 if (v)
1034 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1035 else
1036 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1037 }
1038 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1039 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1040 } else {
1041 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1042 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1043 }
1044 }
07be1b83 1045}
3dab1dad
YO
1046
1047#endif
1048
786e8c11
YO
1049/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1050 startbranch: the first branch in the whole branch sequence
1051 first : start branch of sequence of branch-exact nodes.
1052 May be the same as startbranch
1053 last : Thing following the last branch.
1054 May be the same as tail.
1055 tail : item following the branch sequence
1056 count : words in the sequence
1057 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1058 depth : indent depth
3dab1dad 1059
786e8c11 1060Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1061
786e8c11
YO
1062A trie is an N'ary tree where the branches are determined by digital
1063decomposition of the key. IE, at the root node you look up the 1st character and
1064follow that branch repeat until you find the end of the branches. Nodes can be
1065marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1066
786e8c11 1067 /he|she|his|hers/
72f13be8 1068
786e8c11
YO
1069would convert into the following structure. Numbers represent states, letters
1070following numbers represent valid transitions on the letter from that state, if
1071the number is in square brackets it represents an accepting state, otherwise it
1072will be in parenthesis.
07be1b83 1073
786e8c11
YO
1074 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1075 | |
1076 | (2)
1077 | |
1078 (1) +-i->(6)-+-s->[7]
1079 |
1080 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1081
786e8c11
YO
1082 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1083
1084This shows that when matching against the string 'hers' we will begin at state 1
1085read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1086then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1087is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1088single traverse. We store a mapping from accepting to state to which word was
1089matched, and then when we have multiple possibilities we try to complete the
1090rest of the regex in the order in which they occured in the alternation.
1091
1092The only prior NFA like behaviour that would be changed by the TRIE support is
1093the silent ignoring of duplicate alternations which are of the form:
1094
1095 / (DUPE|DUPE) X? (?{ ... }) Y /x
1096
1097Thus EVAL blocks follwing a trie may be called a different number of times with
1098and without the optimisation. With the optimisations dupes will be silently
1099ignored. This inconsistant behaviour of EVAL type nodes is well established as
1100the following demonstrates:
1101
1102 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1103
1104which prints out 'word' three times, but
1105
1106 'words'=~/(word|word|word)(?{ print $1 })S/
1107
1108which doesnt print it out at all. This is due to other optimisations kicking in.
1109
1110Example of what happens on a structural level:
1111
1112The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1113
1114 1: CURLYM[1] {1,32767}(18)
1115 5: BRANCH(8)
1116 6: EXACT <ac>(16)
1117 8: BRANCH(11)
1118 9: EXACT <ad>(16)
1119 11: BRANCH(14)
1120 12: EXACT <ab>(16)
1121 16: SUCCEED(0)
1122 17: NOTHING(18)
1123 18: END(0)
1124
1125This would be optimizable with startbranch=5, first=5, last=16, tail=16
1126and should turn into:
1127
1128 1: CURLYM[1] {1,32767}(18)
1129 5: TRIE(16)
1130 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1131 <ac>
1132 <ad>
1133 <ab>
1134 16: SUCCEED(0)
1135 17: NOTHING(18)
1136 18: END(0)
1137
1138Cases where tail != last would be like /(?foo|bar)baz/:
1139
1140 1: BRANCH(4)
1141 2: EXACT <foo>(8)
1142 4: BRANCH(7)
1143 5: EXACT <bar>(8)
1144 7: TAIL(8)
1145 8: EXACT <baz>(10)
1146 10: END(0)
1147
1148which would be optimizable with startbranch=1, first=1, last=7, tail=8
1149and would end up looking like:
1150
1151 1: TRIE(8)
1152 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1153 <foo>
1154 <bar>
1155 7: TAIL(8)
1156 8: EXACT <baz>(10)
1157 10: END(0)
1158
1159 d = uvuni_to_utf8_flags(d, uv, 0);
1160
1161is the recommended Unicode-aware way of saying
1162
1163 *(d++) = uv;
1164*/
1165
1e2e3d02 1166#define TRIE_STORE_REVCHAR \
786e8c11 1167 STMT_START { \
73031816
NC
1168 if (UTF) { \
1169 SV *zlopp = newSV(2); \
88c9ea1e
CB
1170 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1171 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1172 SvCUR_set(zlopp, kapow - flrbbbbb); \
1173 SvPOK_on(zlopp); \
1174 SvUTF8_on(zlopp); \
1175 av_push(revcharmap, zlopp); \
1176 } else { \
6bdeddd2 1177 char ooooff = (char)uvc; \
73031816
NC
1178 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1179 } \
1180 } STMT_END
786e8c11
YO
1181
1182#define TRIE_READ_CHAR STMT_START { \
1183 wordlen++; \
1184 if ( UTF ) { \
1185 if ( folder ) { \
1186 if ( foldlen > 0 ) { \
1187 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1188 foldlen -= len; \
1189 scan += len; \
1190 len = 0; \
1191 } else { \
1192 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1193 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1194 foldlen -= UNISKIP( uvc ); \
1195 scan = foldbuf + UNISKIP( uvc ); \
1196 } \
1197 } else { \
1198 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1199 } \
1200 } else { \
1201 uvc = (U32)*uc; \
1202 len = 1; \
1203 } \
1204} STMT_END
1205
1206
1207
1208#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1209 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1210 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1211 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1212 } \
1213 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1214 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1215 TRIE_LIST_CUR( state )++; \
1216} STMT_END
07be1b83 1217
786e8c11
YO
1218#define TRIE_LIST_NEW(state) STMT_START { \
1219 Newxz( trie->states[ state ].trans.list, \
1220 4, reg_trie_trans_le ); \
1221 TRIE_LIST_CUR( state ) = 1; \
1222 TRIE_LIST_LEN( state ) = 4; \
1223} STMT_END
07be1b83 1224
786e8c11
YO
1225#define TRIE_HANDLE_WORD(state) STMT_START { \
1226 U16 dupe= trie->states[ state ].wordnum; \
1227 regnode * const noper_next = regnext( noper ); \
1228 \
1229 if (trie->wordlen) \
1230 trie->wordlen[ curword ] = wordlen; \
1231 DEBUG_r({ \
1232 /* store the word for dumping */ \
1233 SV* tmp; \
1234 if (OP(noper) != NOTHING) \
740cce10 1235 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1236 else \
740cce10 1237 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1238 av_push( trie_words, tmp ); \
786e8c11
YO
1239 }); \
1240 \
1241 curword++; \
1242 \
1243 if ( noper_next < tail ) { \
1244 if (!trie->jump) \
c944940b 1245 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1246 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1247 if (!jumper) \
1248 jumper = noper_next; \
1249 if (!nextbranch) \
1250 nextbranch= regnext(cur); \
1251 } \
1252 \
1253 if ( dupe ) { \
1254 /* So it's a dupe. This means we need to maintain a */\
1255 /* linked-list from the first to the next. */\
1256 /* we only allocate the nextword buffer when there */\
1257 /* a dupe, so first time we have to do the allocation */\
1258 if (!trie->nextword) \
c944940b 1259 trie->nextword = (U16 *) \
446bd890 1260 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1261 while ( trie->nextword[dupe] ) \
1262 dupe= trie->nextword[dupe]; \
1263 trie->nextword[dupe]= curword; \
1264 } else { \
1265 /* we haven't inserted this word yet. */ \
1266 trie->states[ state ].wordnum = curword; \
1267 } \
1268} STMT_END
07be1b83 1269
3dab1dad 1270
786e8c11
YO
1271#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1272 ( ( base + charid >= ucharcount \
1273 && base + charid < ubound \
1274 && state == trie->trans[ base - ucharcount + charid ].check \
1275 && trie->trans[ base - ucharcount + charid ].next ) \
1276 ? trie->trans[ base - ucharcount + charid ].next \
1277 : ( state==1 ? special : 0 ) \
1278 )
3dab1dad 1279
786e8c11
YO
1280#define MADE_TRIE 1
1281#define MADE_JUMP_TRIE 2
1282#define MADE_EXACT_TRIE 4
3dab1dad 1283
a3621e74 1284STATIC I32
786e8c11 1285S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1286{
27da23d5 1287 dVAR;
a3621e74
YO
1288 /* first pass, loop through and scan words */
1289 reg_trie_data *trie;
55eed653 1290 HV *widecharmap = NULL;
2b8b4781 1291 AV *revcharmap = newAV();
a3621e74 1292 regnode *cur;
9f7f3913 1293 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1294 STRLEN len = 0;
1295 UV uvc = 0;
1296 U16 curword = 0;
1297 U32 next_alloc = 0;
786e8c11
YO
1298 regnode *jumper = NULL;
1299 regnode *nextbranch = NULL;
7f69552c 1300 regnode *convert = NULL;
a3621e74 1301 /* we just use folder as a flag in utf8 */
e1ec3a88 1302 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1303 ? PL_fold
1304 : ( flags == EXACTFL
1305 ? PL_fold_locale
1306 : NULL
1307 )
1308 );
1309
2b8b4781
NC
1310#ifdef DEBUGGING
1311 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1312 AV *trie_words = NULL;
1313 /* along with revcharmap, this only used during construction but both are
1314 * useful during debugging so we store them in the struct when debugging.
8e11feef 1315 */
2b8b4781
NC
1316#else
1317 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1318 STRLEN trie_charcount=0;
3dab1dad 1319#endif
2b8b4781 1320 SV *re_trie_maxbuff;
a3621e74 1321 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1322#ifndef DEBUGGING
1323 PERL_UNUSED_ARG(depth);
1324#endif
a3621e74 1325
c944940b 1326 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1327 trie->refcount = 1;
3dab1dad 1328 trie->startstate = 1;
786e8c11 1329 trie->wordcount = word_count;
f8fc2ecf 1330 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1331 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1332 if (!(UTF && folder))
c944940b 1333 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1334 DEBUG_r({
2b8b4781 1335 trie_words = newAV();
a3621e74 1336 });
a3621e74 1337
0111c4fd 1338 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1339 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1340 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1341 }
3dab1dad
YO
1342 DEBUG_OPTIMISE_r({
1343 PerlIO_printf( Perl_debug_log,
786e8c11 1344 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1345 (int)depth * 2 + 2, "",
1346 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1347 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1348 (int)depth);
3dab1dad 1349 });
7f69552c
YO
1350
1351 /* Find the node we are going to overwrite */
1352 if ( first == startbranch && OP( last ) != BRANCH ) {
1353 /* whole branch chain */
1354 convert = first;
1355 } else {
1356 /* branch sub-chain */
1357 convert = NEXTOPER( first );
1358 }
1359
a3621e74
YO
1360 /* -- First loop and Setup --
1361
1362 We first traverse the branches and scan each word to determine if it
1363 contains widechars, and how many unique chars there are, this is
1364 important as we have to build a table with at least as many columns as we
1365 have unique chars.
1366
1367 We use an array of integers to represent the character codes 0..255
38a44b82 1368 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1369 native representation of the character value as the key and IV's for the
1370 coded index.
1371
1372 *TODO* If we keep track of how many times each character is used we can
1373 remap the columns so that the table compression later on is more
1374 efficient in terms of memory by ensuring most common value is in the
1375 middle and the least common are on the outside. IMO this would be better
1376 than a most to least common mapping as theres a decent chance the most
1377 common letter will share a node with the least common, meaning the node
1378 will not be compressable. With a middle is most common approach the worst
1379 case is when we have the least common nodes twice.
1380
1381 */
1382
a3621e74 1383 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1384 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1385 const U8 *uc = (U8*)STRING( noper );
a28509cc 1386 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1387 STRLEN foldlen = 0;
1388 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1389 const U8 *scan = (U8*)NULL;
07be1b83 1390 U32 wordlen = 0; /* required init */
02daf0ab
YO
1391 STRLEN chars = 0;
1392 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1393
3dab1dad
YO
1394 if (OP(noper) == NOTHING) {
1395 trie->minlen= 0;
1396 continue;
1397 }
02daf0ab
YO
1398 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1399 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1400 regardless of encoding */
1401
a3621e74 1402 for ( ; uc < e ; uc += len ) {
3dab1dad 1403 TRIE_CHARCOUNT(trie)++;
a3621e74 1404 TRIE_READ_CHAR;
3dab1dad 1405 chars++;
a3621e74
YO
1406 if ( uvc < 256 ) {
1407 if ( !trie->charmap[ uvc ] ) {
1408 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1409 if ( folder )
1410 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1411 TRIE_STORE_REVCHAR;
a3621e74 1412 }
02daf0ab
YO
1413 if ( set_bit ) {
1414 /* store the codepoint in the bitmap, and if its ascii
1415 also store its folded equivelent. */
1416 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1417
1418 /* store the folded codepoint */
1419 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1420
1421 if ( !UTF ) {
1422 /* store first byte of utf8 representation of
1423 codepoints in the 127 < uvc < 256 range */
1424 if (127 < uvc && uvc < 192) {
1425 TRIE_BITMAP_SET(trie,194);
1426 } else if (191 < uvc ) {
1427 TRIE_BITMAP_SET(trie,195);
1428 /* && uvc < 256 -- we know uvc is < 256 already */
1429 }
1430 }
02daf0ab
YO
1431 set_bit = 0; /* We've done our bit :-) */
1432 }
a3621e74
YO
1433 } else {
1434 SV** svpp;
55eed653
NC
1435 if ( !widecharmap )
1436 widecharmap = newHV();
a3621e74 1437
55eed653 1438 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1439
1440 if ( !svpp )
e4584336 1441 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1442
1443 if ( !SvTRUE( *svpp ) ) {
1444 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1445 TRIE_STORE_REVCHAR;
a3621e74
YO
1446 }
1447 }
1448 }
3dab1dad
YO
1449 if( cur == first ) {
1450 trie->minlen=chars;
1451 trie->maxlen=chars;
1452 } else if (chars < trie->minlen) {
1453 trie->minlen=chars;
1454 } else if (chars > trie->maxlen) {
1455 trie->maxlen=chars;
1456 }
1457
a3621e74
YO
1458 } /* end first pass */
1459 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1460 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1461 (int)depth * 2 + 2,"",
55eed653 1462 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1463 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1464 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1465 );
c944940b 1466 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1467
1468 /*
1469 We now know what we are dealing with in terms of unique chars and
1470 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1471 representation using a flat table will take. If it's over a reasonable
1472 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1473 conservative but potentially much slower representation using an array
1474 of lists.
1475
1476 At the end we convert both representations into the same compressed
1477 form that will be used in regexec.c for matching with. The latter
1478 is a form that cannot be used to construct with but has memory
1479 properties similar to the list form and access properties similar
1480 to the table form making it both suitable for fast searches and
1481 small enough that its feasable to store for the duration of a program.
1482
1483 See the comment in the code where the compressed table is produced
1484 inplace from the flat tabe representation for an explanation of how
1485 the compression works.
1486
1487 */
1488
1489
3dab1dad 1490 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1491 /*
1492 Second Pass -- Array Of Lists Representation
1493
1494 Each state will be represented by a list of charid:state records
1495 (reg_trie_trans_le) the first such element holds the CUR and LEN
1496 points of the allocated array. (See defines above).
1497
1498 We build the initial structure using the lists, and then convert
1499 it into the compressed table form which allows faster lookups
1500 (but cant be modified once converted).
a3621e74
YO
1501 */
1502
a3621e74
YO
1503 STRLEN transcount = 1;
1504
1e2e3d02
YO
1505 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1506 "%*sCompiling trie using list compiler\n",
1507 (int)depth * 2 + 2, ""));
446bd890 1508
c944940b
JH
1509 trie->states = (reg_trie_state *)
1510 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1511 sizeof(reg_trie_state) );
a3621e74
YO
1512 TRIE_LIST_NEW(1);
1513 next_alloc = 2;
1514
1515 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1516
c445ea15
AL
1517 regnode * const noper = NEXTOPER( cur );
1518 U8 *uc = (U8*)STRING( noper );
1519 const U8 * const e = uc + STR_LEN( noper );
1520 U32 state = 1; /* required init */
1521 U16 charid = 0; /* sanity init */
1522 U8 *scan = (U8*)NULL; /* sanity init */
1523 STRLEN foldlen = 0; /* required init */
07be1b83 1524 U32 wordlen = 0; /* required init */
c445ea15
AL
1525 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1526
3dab1dad 1527 if (OP(noper) != NOTHING) {
786e8c11 1528 for ( ; uc < e ; uc += len ) {
c445ea15 1529
786e8c11 1530 TRIE_READ_CHAR;
c445ea15 1531
786e8c11
YO
1532 if ( uvc < 256 ) {
1533 charid = trie->charmap[ uvc ];
c445ea15 1534 } else {
55eed653 1535 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1536 if ( !svpp ) {
1537 charid = 0;
1538 } else {
1539 charid=(U16)SvIV( *svpp );
1540 }
c445ea15 1541 }
786e8c11
YO
1542 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1543 if ( charid ) {
a3621e74 1544
786e8c11
YO
1545 U16 check;
1546 U32 newstate = 0;
a3621e74 1547
786e8c11
YO
1548 charid--;
1549 if ( !trie->states[ state ].trans.list ) {
1550 TRIE_LIST_NEW( state );
c445ea15 1551 }
786e8c11
YO
1552 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1553 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1554 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1555 break;
1556 }
1557 }
1558 if ( ! newstate ) {
1559 newstate = next_alloc++;
1560 TRIE_LIST_PUSH( state, charid, newstate );
1561 transcount++;
1562 }
1563 state = newstate;
1564 } else {
1565 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1566 }
a28509cc 1567 }
c445ea15 1568 }
3dab1dad 1569 TRIE_HANDLE_WORD(state);
a3621e74
YO
1570
1571 } /* end second pass */
1572
1e2e3d02
YO
1573 /* next alloc is the NEXT state to be allocated */
1574 trie->statecount = next_alloc;
c944940b
JH
1575 trie->states = (reg_trie_state *)
1576 PerlMemShared_realloc( trie->states,
1577 next_alloc
1578 * sizeof(reg_trie_state) );
a3621e74 1579
3dab1dad 1580 /* and now dump it out before we compress it */
2b8b4781
NC
1581 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1582 revcharmap, next_alloc,
1583 depth+1)
1e2e3d02 1584 );
a3621e74 1585
c944940b
JH
1586 trie->trans = (reg_trie_trans *)
1587 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1588 {
1589 U32 state;
a3621e74
YO
1590 U32 tp = 0;
1591 U32 zp = 0;
1592
1593
1594 for( state=1 ; state < next_alloc ; state ++ ) {
1595 U32 base=0;
1596
1597 /*
1598 DEBUG_TRIE_COMPILE_MORE_r(
1599 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1600 );
1601 */
1602
1603 if (trie->states[state].trans.list) {
1604 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1605 U16 maxid=minid;
a28509cc 1606 U16 idx;
a3621e74
YO
1607
1608 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1609 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1610 if ( forid < minid ) {
1611 minid=forid;
1612 } else if ( forid > maxid ) {
1613 maxid=forid;
1614 }
a3621e74
YO
1615 }
1616 if ( transcount < tp + maxid - minid + 1) {
1617 transcount *= 2;
c944940b
JH
1618 trie->trans = (reg_trie_trans *)
1619 PerlMemShared_realloc( trie->trans,
446bd890
NC
1620 transcount
1621 * sizeof(reg_trie_trans) );
a3621e74
YO
1622 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1623 }
1624 base = trie->uniquecharcount + tp - minid;
1625 if ( maxid == minid ) {
1626 U32 set = 0;
1627 for ( ; zp < tp ; zp++ ) {
1628 if ( ! trie->trans[ zp ].next ) {
1629 base = trie->uniquecharcount + zp - minid;
1630 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1631 trie->trans[ zp ].check = state;
1632 set = 1;
1633 break;
1634 }
1635 }
1636 if ( !set ) {
1637 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1638 trie->trans[ tp ].check = state;
1639 tp++;
1640 zp = tp;
1641 }
1642 } else {
1643 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1644 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1645 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1646 trie->trans[ tid ].check = state;
1647 }
1648 tp += ( maxid - minid + 1 );
1649 }
1650 Safefree(trie->states[ state ].trans.list);
1651 }
1652 /*
1653 DEBUG_TRIE_COMPILE_MORE_r(
1654 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1655 );
1656 */
1657 trie->states[ state ].trans.base=base;
1658 }
cc601c31 1659 trie->lasttrans = tp + 1;
a3621e74
YO
1660 }
1661 } else {
1662 /*
1663 Second Pass -- Flat Table Representation.
1664
1665 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1666 We know that we will need Charcount+1 trans at most to store the data
1667 (one row per char at worst case) So we preallocate both structures
1668 assuming worst case.
1669
1670 We then construct the trie using only the .next slots of the entry
1671 structs.
1672
1673 We use the .check field of the first entry of the node temporarily to
1674 make compression both faster and easier by keeping track of how many non
1675 zero fields are in the node.
1676
1677 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1678 transition.
1679
1680 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1681 number representing the first entry of the node, and state as a
1682 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1683 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1684 are 2 entrys per node. eg:
1685
1686 A B A B
1687 1. 2 4 1. 3 7
1688 2. 0 3 3. 0 5
1689 3. 0 0 5. 0 0
1690 4. 0 0 7. 0 0
1691
1692 The table is internally in the right hand, idx form. However as we also
1693 have to deal with the states array which is indexed by nodenum we have to
1694 use TRIE_NODENUM() to convert.
1695
1696 */
1e2e3d02
YO
1697 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1698 "%*sCompiling trie using table compiler\n",
1699 (int)depth * 2 + 2, ""));
3dab1dad 1700
c944940b
JH
1701 trie->trans = (reg_trie_trans *)
1702 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1703 * trie->uniquecharcount + 1,
1704 sizeof(reg_trie_trans) );
1705 trie->states = (reg_trie_state *)
1706 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1707 sizeof(reg_trie_state) );
a3621e74
YO
1708 next_alloc = trie->uniquecharcount + 1;
1709
3dab1dad 1710
a3621e74
YO
1711 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1712
c445ea15 1713 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1714 const U8 *uc = (U8*)STRING( noper );
1715 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1716
1717 U32 state = 1; /* required init */
1718
1719 U16 charid = 0; /* sanity init */
1720 U32 accept_state = 0; /* sanity init */
1721 U8 *scan = (U8*)NULL; /* sanity init */
1722
1723 STRLEN foldlen = 0; /* required init */
07be1b83 1724 U32 wordlen = 0; /* required init */
a3621e74
YO
1725 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1726
3dab1dad 1727 if ( OP(noper) != NOTHING ) {
786e8c11 1728 for ( ; uc < e ; uc += len ) {
a3621e74 1729
786e8c11 1730 TRIE_READ_CHAR;
a3621e74 1731
786e8c11
YO
1732 if ( uvc < 256 ) {
1733 charid = trie->charmap[ uvc ];
1734 } else {
55eed653 1735 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1736 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1737 }
786e8c11
YO
1738 if ( charid ) {
1739 charid--;
1740 if ( !trie->trans[ state + charid ].next ) {
1741 trie->trans[ state + charid ].next = next_alloc;
1742 trie->trans[ state ].check++;
1743 next_alloc += trie->uniquecharcount;
1744 }
1745 state = trie->trans[ state + charid ].next;
1746 } else {
1747 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1748 }
1749 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1750 }
a3621e74 1751 }
3dab1dad
YO
1752 accept_state = TRIE_NODENUM( state );
1753 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1754
1755 } /* end second pass */
1756
3dab1dad 1757 /* and now dump it out before we compress it */
2b8b4781
NC
1758 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1759 revcharmap,
1760 next_alloc, depth+1));
a3621e74 1761
a3621e74
YO
1762 {
1763 /*
1764 * Inplace compress the table.*
1765
1766 For sparse data sets the table constructed by the trie algorithm will
1767 be mostly 0/FAIL transitions or to put it another way mostly empty.
1768 (Note that leaf nodes will not contain any transitions.)
1769
1770 This algorithm compresses the tables by eliminating most such
1771 transitions, at the cost of a modest bit of extra work during lookup:
1772
1773 - Each states[] entry contains a .base field which indicates the
1774 index in the state[] array wheres its transition data is stored.
1775
1776 - If .base is 0 there are no valid transitions from that node.
1777
1778 - If .base is nonzero then charid is added to it to find an entry in
1779 the trans array.
1780
1781 -If trans[states[state].base+charid].check!=state then the
1782 transition is taken to be a 0/Fail transition. Thus if there are fail
1783 transitions at the front of the node then the .base offset will point
1784 somewhere inside the previous nodes data (or maybe even into a node
1785 even earlier), but the .check field determines if the transition is
1786 valid.
1787
786e8c11 1788 XXX - wrong maybe?
a3621e74
YO
1789 The following process inplace converts the table to the compressed
1790 table: We first do not compress the root node 1,and mark its all its
1791 .check pointers as 1 and set its .base pointer as 1 as well. This
1792 allows to do a DFA construction from the compressed table later, and
1793 ensures that any .base pointers we calculate later are greater than
1794 0.
1795
1796 - We set 'pos' to indicate the first entry of the second node.
1797
1798 - We then iterate over the columns of the node, finding the first and
1799 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1800 and set the .check pointers accordingly, and advance pos
1801 appropriately and repreat for the next node. Note that when we copy
1802 the next pointers we have to convert them from the original
1803 NODEIDX form to NODENUM form as the former is not valid post
1804 compression.
1805
1806 - If a node has no transitions used we mark its base as 0 and do not
1807 advance the pos pointer.
1808
1809 - If a node only has one transition we use a second pointer into the
1810 structure to fill in allocated fail transitions from other states.
1811 This pointer is independent of the main pointer and scans forward
1812 looking for null transitions that are allocated to a state. When it
1813 finds one it writes the single transition into the "hole". If the
786e8c11 1814 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1815
1816 - Once compressed we can Renew/realloc the structures to release the
1817 excess space.
1818
1819 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1820 specifically Fig 3.47 and the associated pseudocode.
1821
1822 demq
1823 */
a3b680e6 1824 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1825 U32 state, charid;
a3621e74 1826 U32 pos = 0, zp=0;
1e2e3d02 1827 trie->statecount = laststate;
a3621e74
YO
1828
1829 for ( state = 1 ; state < laststate ; state++ ) {
1830 U8 flag = 0;
a28509cc
AL
1831 const U32 stateidx = TRIE_NODEIDX( state );
1832 const U32 o_used = trie->trans[ stateidx ].check;
1833 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1834 trie->trans[ stateidx ].check = 0;
1835
1836 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1837 if ( flag || trie->trans[ stateidx + charid ].next ) {
1838 if ( trie->trans[ stateidx + charid ].next ) {
1839 if (o_used == 1) {
1840 for ( ; zp < pos ; zp++ ) {
1841 if ( ! trie->trans[ zp ].next ) {
1842 break;
1843 }
1844 }
1845 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1846 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1847 trie->trans[ zp ].check = state;
1848 if ( ++zp > pos ) pos = zp;
1849 break;
1850 }
1851 used--;
1852 }
1853 if ( !flag ) {
1854 flag = 1;
1855 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1856 }
1857 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1858 trie->trans[ pos ].check = state;
1859 pos++;
1860 }
1861 }
1862 }
cc601c31 1863 trie->lasttrans = pos + 1;
c944940b
JH
1864 trie->states = (reg_trie_state *)
1865 PerlMemShared_realloc( trie->states, laststate
1866 * sizeof(reg_trie_state) );
a3621e74 1867 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1868 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1869 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1870 (int)depth * 2 + 2,"",
1871 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1872 (IV)next_alloc,
1873 (IV)pos,
a3621e74
YO
1874 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1875 );
1876
1877 } /* end table compress */
1878 }
1e2e3d02
YO
1879 DEBUG_TRIE_COMPILE_MORE_r(
1880 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1881 (int)depth * 2 + 2, "",
1882 (UV)trie->statecount,
1883 (UV)trie->lasttrans)
1884 );
cc601c31 1885 /* resize the trans array to remove unused space */
c944940b
JH
1886 trie->trans = (reg_trie_trans *)
1887 PerlMemShared_realloc( trie->trans, trie->lasttrans
1888 * sizeof(reg_trie_trans) );
a3621e74 1889
3dab1dad 1890 /* and now dump out the compressed format */
2b8b4781 1891 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1892
3dab1dad 1893 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1894 U8 nodetype =(U8)(flags & 0xFF);
1895 char *str=NULL;
786e8c11 1896
07be1b83 1897#ifdef DEBUGGING
e62cc96a 1898 regnode *optimize = NULL;
7122b237
YO
1899#ifdef RE_TRACK_PATTERN_OFFSETS
1900
b57a0404
JH
1901 U32 mjd_offset = 0;
1902 U32 mjd_nodelen = 0;
7122b237
YO
1903#endif /* RE_TRACK_PATTERN_OFFSETS */
1904#endif /* DEBUGGING */
a3621e74 1905 /*
3dab1dad
YO
1906 This means we convert either the first branch or the first Exact,
1907 depending on whether the thing following (in 'last') is a branch
1908 or not and whther first is the startbranch (ie is it a sub part of
1909 the alternation or is it the whole thing.)
1910 Assuming its a sub part we conver the EXACT otherwise we convert
1911 the whole branch sequence, including the first.
a3621e74 1912 */
3dab1dad 1913 /* Find the node we are going to overwrite */
7f69552c 1914 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1915 /* branch sub-chain */
3dab1dad 1916 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1917#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1918 DEBUG_r({
1919 mjd_offset= Node_Offset((convert));
1920 mjd_nodelen= Node_Length((convert));
1921 });
7122b237 1922#endif
7f69552c 1923 /* whole branch chain */
7122b237
YO
1924 }
1925#ifdef RE_TRACK_PATTERN_OFFSETS
1926 else {
7f69552c
YO
1927 DEBUG_r({
1928 const regnode *nop = NEXTOPER( convert );
1929 mjd_offset= Node_Offset((nop));
1930 mjd_nodelen= Node_Length((nop));
1931 });
07be1b83
YO
1932 }
1933 DEBUG_OPTIMISE_r(
1934 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1935 (int)depth * 2 + 2, "",
786e8c11 1936 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1937 );
7122b237 1938#endif
3dab1dad
YO
1939 /* But first we check to see if there is a common prefix we can
1940 split out as an EXACT and put in front of the TRIE node. */
1941 trie->startstate= 1;
55eed653 1942 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1943 U32 state;
1e2e3d02 1944 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1945 U32 ofs = 0;
8e11feef
RGS
1946 I32 idx = -1;
1947 U32 count = 0;
1948 const U32 base = trie->states[ state ].trans.base;
a3621e74 1949
3dab1dad 1950 if ( trie->states[state].wordnum )
8e11feef 1951 count = 1;
a3621e74 1952
8e11feef 1953 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1954 if ( ( base + ofs >= trie->uniquecharcount ) &&
1955 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1956 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1957 {
3dab1dad 1958 if ( ++count > 1 ) {
2b8b4781 1959 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1960 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1961 if ( state == 1 ) break;
3dab1dad
YO
1962 if ( count == 2 ) {
1963 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1964 DEBUG_OPTIMISE_r(
8e11feef
RGS
1965 PerlIO_printf(Perl_debug_log,
1966 "%*sNew Start State=%"UVuf" Class: [",
1967 (int)depth * 2 + 2, "",
786e8c11 1968 (UV)state));
be8e71aa 1969 if (idx >= 0) {
2b8b4781 1970 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1971 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1972
3dab1dad 1973 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1974 if ( folder )
1975 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1976 DEBUG_OPTIMISE_r(
07be1b83 1977 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1978 );
8e11feef
RGS
1979 }
1980 }
1981 TRIE_BITMAP_SET(trie,*ch);
1982 if ( folder )
1983 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1984 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1985 }
1986 idx = ofs;
1987 }
3dab1dad
YO
1988 }
1989 if ( count == 1 ) {
2b8b4781 1990 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
1991 STRLEN len;
1992 char *ch = SvPV( *tmp, len );
de734bd5
A
1993 DEBUG_OPTIMISE_r({
1994 SV *sv=sv_newmortal();
8e11feef
RGS
1995 PerlIO_printf( Perl_debug_log,
1996 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1997 (int)depth * 2 + 2, "",
de734bd5
A
1998 (UV)state, (UV)idx,
1999 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2000 PL_colors[0], PL_colors[1],
2001 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2002 PERL_PV_ESCAPE_FIRSTCHAR
2003 )
2004 );
2005 });
3dab1dad
YO
2006 if ( state==1 ) {
2007 OP( convert ) = nodetype;
2008 str=STRING(convert);
2009 STR_LEN(convert)=0;
2010 }
c490c714
YO
2011 STR_LEN(convert) += len;
2012 while (len--)
de734bd5 2013 *str++ = *ch++;
8e11feef 2014 } else {
f9049ba1 2015#ifdef DEBUGGING
8e11feef
RGS
2016 if (state>1)
2017 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2018#endif
8e11feef
RGS
2019 break;
2020 }
2021 }
3dab1dad 2022 if (str) {
8e11feef 2023 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2024 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2025 trie->startstate = state;
07be1b83
YO
2026 trie->minlen -= (state - 1);
2027 trie->maxlen -= (state - 1);
33809eae
JH
2028#ifdef DEBUGGING
2029 /* At least the UNICOS C compiler choked on this
2030 * being argument to DEBUG_r(), so let's just have
2031 * it right here. */
2032 if (
2033#ifdef PERL_EXT_RE_BUILD
2034 1
2035#else
2036 DEBUG_r_TEST
2037#endif
2038 ) {
2039 regnode *fix = convert;
2040 U32 word = trie->wordcount;
2041 mjd_nodelen++;
2042 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2043 while( ++fix < n ) {
2044 Set_Node_Offset_Length(fix, 0, 0);
2045 }
2046 while (word--) {
2047 SV ** const tmp = av_fetch( trie_words, word, 0 );
2048 if (tmp) {
2049 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2050 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2051 else
2052 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2053 }
2054 }
2055 }
2056#endif
8e11feef
RGS
2057 if (trie->maxlen) {
2058 convert = n;
2059 } else {
3dab1dad 2060 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2061 DEBUG_r(optimize= n);
3dab1dad
YO
2062 }
2063 }
2064 }
a5ca303d
YO
2065 if (!jumper)
2066 jumper = last;
3dab1dad 2067 if ( trie->maxlen ) {
8e11feef
RGS
2068 NEXT_OFF( convert ) = (U16)(tail - convert);
2069 ARG_SET( convert, data_slot );
786e8c11
YO
2070 /* Store the offset to the first unabsorbed branch in
2071 jump[0], which is otherwise unused by the jump logic.
2072 We use this when dumping a trie and during optimisation. */
2073 if (trie->jump)
7f69552c 2074 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2075
786e8c11
YO
2076 /* XXXX */
2077 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2078 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2079 {
2080 OP( convert ) = TRIEC;
2081 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2082 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2083 trie->bitmap= NULL;
2084 } else
2085 OP( convert ) = TRIE;
a3621e74 2086
3dab1dad
YO
2087 /* store the type in the flags */
2088 convert->flags = nodetype;
a5ca303d
YO
2089 DEBUG_r({
2090 optimize = convert
2091 + NODE_STEP_REGNODE
2092 + regarglen[ OP( convert ) ];
2093 });
2094 /* XXX We really should free up the resource in trie now,
2095 as we won't use them - (which resources?) dmq */
3dab1dad 2096 }
a3621e74 2097 /* needed for dumping*/
e62cc96a 2098 DEBUG_r(if (optimize) {
07be1b83 2099 regnode *opt = convert;
bcdf7404 2100
e62cc96a 2101 while ( ++opt < optimize) {
07be1b83
YO
2102 Set_Node_Offset_Length(opt,0,0);
2103 }
786e8c11
YO
2104 /*
2105 Try to clean up some of the debris left after the
2106 optimisation.
a3621e74 2107 */
786e8c11 2108 while( optimize < jumper ) {
07be1b83 2109 mjd_nodelen += Node_Length((optimize));
a3621e74 2110 OP( optimize ) = OPTIMIZED;
07be1b83 2111 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2112 optimize++;
2113 }
07be1b83 2114 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2115 });
2116 } /* end node insert */
55eed653 2117 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2118#ifdef DEBUGGING
2119 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2120 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2121#else
2122 SvREFCNT_dec(revcharmap);
07be1b83 2123#endif
786e8c11
YO
2124 return trie->jump
2125 ? MADE_JUMP_TRIE
2126 : trie->startstate>1
2127 ? MADE_EXACT_TRIE
2128 : MADE_TRIE;
2129}
2130
2131STATIC void
2132S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2133{
2134/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2135
2136 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2137 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2138 ISBN 0-201-10088-6
2139
2140 We find the fail state for each state in the trie, this state is the longest proper
2141 suffix of the current states 'word' that is also a proper prefix of another word in our
2142 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2143 the DFA not to have to restart after its tried and failed a word at a given point, it
2144 simply continues as though it had been matching the other word in the first place.
2145 Consider
2146 'abcdgu'=~/abcdefg|cdgu/
2147 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2148 fail, which would bring use to the state representing 'd' in the second word where we would
2149 try 'g' and succeed, prodceding to match 'cdgu'.
2150 */
2151 /* add a fail transition */
3251b653
NC
2152 const U32 trie_offset = ARG(source);
2153 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2154 U32 *q;
2155 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2156 const U32 numstates = trie->statecount;
786e8c11
YO
2157 const U32 ubound = trie->lasttrans + ucharcount;
2158 U32 q_read = 0;
2159 U32 q_write = 0;
2160 U32 charid;
2161 U32 base = trie->states[ 1 ].trans.base;
2162 U32 *fail;
2163 reg_ac_data *aho;
2164 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2165 GET_RE_DEBUG_FLAGS_DECL;
2166#ifndef DEBUGGING
2167 PERL_UNUSED_ARG(depth);
2168#endif
2169
2170
2171 ARG_SET( stclass, data_slot );
c944940b 2172 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2173 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2174 aho->trie=trie_offset;
446bd890
NC
2175 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2176 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2177 Newxz( q, numstates, U32);
c944940b 2178 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2179 aho->refcount = 1;
2180 fail = aho->fail;
2181 /* initialize fail[0..1] to be 1 so that we always have
2182 a valid final fail state */
2183 fail[ 0 ] = fail[ 1 ] = 1;
2184
2185 for ( charid = 0; charid < ucharcount ; charid++ ) {
2186 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2187 if ( newstate ) {
2188 q[ q_write ] = newstate;
2189 /* set to point at the root */
2190 fail[ q[ q_write++ ] ]=1;
2191 }
2192 }
2193 while ( q_read < q_write) {
2194 const U32 cur = q[ q_read++ % numstates ];
2195 base = trie->states[ cur ].trans.base;
2196
2197 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2198 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2199 if (ch_state) {
2200 U32 fail_state = cur;
2201 U32 fail_base;
2202 do {
2203 fail_state = fail[ fail_state ];
2204 fail_base = aho->states[ fail_state ].trans.base;
2205 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2206
2207 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2208 fail[ ch_state ] = fail_state;
2209 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2210 {
2211 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2212 }
2213 q[ q_write++ % numstates] = ch_state;
2214 }
2215 }
2216 }
2217 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2218 when we fail in state 1, this allows us to use the
2219 charclass scan to find a valid start char. This is based on the principle
2220 that theres a good chance the string being searched contains lots of stuff
2221 that cant be a start char.
2222 */
2223 fail[ 0 ] = fail[ 1 ] = 0;
2224 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2225 PerlIO_printf(Perl_debug_log,
2226 "%*sStclass Failtable (%"UVuf" states): 0",
2227 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2228 );
786e8c11
YO
2229 for( q_read=1; q_read<numstates; q_read++ ) {
2230 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2231 }
2232 PerlIO_printf(Perl_debug_log, "\n");
2233 });
2234 Safefree(q);
2235 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2236}
2237
786e8c11 2238
a3621e74 2239/*
5d1c421c
JH
2240 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2241 * These need to be revisited when a newer toolchain becomes available.
2242 */
2243#if defined(__sparc64__) && defined(__GNUC__)
2244# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2245# undef SPARC64_GCC_WORKAROUND
2246# define SPARC64_GCC_WORKAROUND 1
2247# endif
2248#endif
2249
07be1b83 2250#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2251 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2252 SV * const mysv=sv_newmortal(); \
2253 regnode *Next = regnext(scan); \
2254 regprop(RExC_rx, mysv, scan); \
7f69552c 2255 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2256 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2257 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2258 }});
07be1b83 2259
1de06328
YO
2260
2261
2262
2263
07be1b83
YO
2264#define JOIN_EXACT(scan,min,flags) \
2265 if (PL_regkind[OP(scan)] == EXACT) \
2266 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2267
be8e71aa 2268STATIC U32
07be1b83
YO
2269S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2270 /* Merge several consecutive EXACTish nodes into one. */
2271 regnode *n = regnext(scan);
2272 U32 stringok = 1;
2273 regnode *next = scan + NODE_SZ_STR(scan);
2274 U32 merged = 0;
2275 U32 stopnow = 0;
2276#ifdef DEBUGGING
2277 regnode *stop = scan;
72f13be8 2278 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2279#else
d47053eb
RGS
2280 PERL_UNUSED_ARG(depth);
2281#endif
2282#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2283 PERL_UNUSED_ARG(flags);
2284 PERL_UNUSED_ARG(val);
07be1b83 2285#endif
07be1b83
YO
2286 DEBUG_PEEP("join",scan,depth);
2287
2288 /* Skip NOTHING, merge EXACT*. */
2289 while (n &&
2290 ( PL_regkind[OP(n)] == NOTHING ||
2291 (stringok && (OP(n) == OP(scan))))
2292 && NEXT_OFF(n)
2293 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2294
2295 if (OP(n) == TAIL || n > next)
2296 stringok = 0;
2297 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2298 DEBUG_PEEP("skip:",n,depth);
2299 NEXT_OFF(scan) += NEXT_OFF(n);
2300 next = n + NODE_STEP_REGNODE;
2301#ifdef DEBUGGING
2302 if (stringok)
2303 stop = n;
2304#endif
2305 n = regnext(n);
2306 }
2307 else if (stringok) {
786e8c11 2308 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2309 regnode * const nnext = regnext(n);
2310
2311 DEBUG_PEEP("merg",n,depth);
2312
2313 merged++;
2314 if (oldl + STR_LEN(n) > U8_MAX)
2315 break;
2316 NEXT_OFF(scan) += NEXT_OFF(n);
2317 STR_LEN(scan) += STR_LEN(n);
2318 next = n + NODE_SZ_STR(n);
2319 /* Now we can overwrite *n : */
2320 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2321#ifdef DEBUGGING
2322 stop = next - 1;
2323#endif
2324 n = nnext;
2325 if (stopnow) break;
2326 }
2327
d47053eb
RGS
2328#ifdef EXPERIMENTAL_INPLACESCAN
2329 if (flags && !NEXT_OFF(n)) {
2330 DEBUG_PEEP("atch", val, depth);
2331 if (reg_off_by_arg[OP(n)]) {
2332 ARG_SET(n, val - n);
2333 }
2334 else {
2335 NEXT_OFF(n) = val - n;
2336 }
2337 stopnow = 1;
2338 }
07be1b83
YO
2339#endif
2340 }
2341
2342 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2343 /*
2344 Two problematic code points in Unicode casefolding of EXACT nodes:
2345
2346 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2347 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2348
2349 which casefold to
2350
2351 Unicode UTF-8
2352
2353 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2354 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2355
2356 This means that in case-insensitive matching (or "loose matching",
2357 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2358 length of the above casefolded versions) can match a target string
2359 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2360 This would rather mess up the minimum length computation.
2361
2362 What we'll do is to look for the tail four bytes, and then peek
2363 at the preceding two bytes to see whether we need to decrease
2364 the minimum length by four (six minus two).
2365
2366 Thanks to the design of UTF-8, there cannot be false matches:
2367 A sequence of valid UTF-8 bytes cannot be a subsequence of
2368 another valid sequence of UTF-8 bytes.
2369
2370 */
2371 char * const s0 = STRING(scan), *s, *t;
2372 char * const s1 = s0 + STR_LEN(scan) - 1;
2373 char * const s2 = s1 - 4;
e294cc5d
JH
2374#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2375 const char t0[] = "\xaf\x49\xaf\x42";
2376#else
07be1b83 2377 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2378#endif
07be1b83
YO
2379 const char * const t1 = t0 + 3;
2380
2381 for (s = s0 + 2;
2382 s < s2 && (t = ninstr(s, s1, t0, t1));
2383 s = t + 4) {
e294cc5d
JH
2384#ifdef EBCDIC
2385 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2386 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2387#else
07be1b83
YO
2388 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2389 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2390#endif
07be1b83
YO
2391 *min -= 4;
2392 }
2393 }
2394
2395#ifdef DEBUGGING
2396 /* Allow dumping */
2397 n = scan + NODE_SZ_STR(scan);
2398 while (n <= stop) {
2399 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2400 OP(n) = OPTIMIZED;
2401 NEXT_OFF(n) = 0;
2402 }
2403 n++;
2404 }
2405#endif
2406 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2407 return stopnow;
2408}
2409
653099ff
GS
2410/* REx optimizer. Converts nodes into quickier variants "in place".
2411 Finds fixed substrings. */
2412
a0288114 2413/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2414 to the position after last scanned or to NULL. */
2415
40d049e4
YO
2416#define INIT_AND_WITHP \
2417 assert(!and_withp); \
2418 Newx(and_withp,1,struct regnode_charclass_class); \
2419 SAVEFREEPV(and_withp)
07be1b83 2420
b515a41d
YO
2421/* this is a chain of data about sub patterns we are processing that
2422 need to be handled seperately/specially in study_chunk. Its so
2423 we can simulate recursion without losing state. */
2424struct scan_frame;
2425typedef struct scan_frame {
2426 regnode *last; /* last node to process in this frame */
2427 regnode *next; /* next node to process when last is reached */
2428 struct scan_frame *prev; /*previous frame*/
2429 I32 stop; /* what stopparen do we use */
2430} scan_frame;
2431
304ee84b
YO
2432
2433#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2434
e1d1eefb
YO
2435#define CASE_SYNST_FNC(nAmE) \
2436case nAmE: \
2437 if (flags & SCF_DO_STCLASS_AND) { \
2438 for (value = 0; value < 256; value++) \
2439 if (!is_ ## nAmE ## _cp(value)) \
2440 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2441 } \
2442 else { \
2443 for (value = 0; value < 256; value++) \
2444 if (is_ ## nAmE ## _cp(value)) \
2445 ANYOF_BITMAP_SET(data->start_class, value); \
2446 } \
2447 break; \
2448case N ## nAmE: \
2449 if (flags & SCF_DO_STCLASS_AND) { \
2450 for (value = 0; value < 256; value++) \
2451 if (is_ ## nAmE ## _cp(value)) \
2452 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2453 } \
2454 else { \
2455 for (value = 0; value < 256; value++) \
2456 if (!is_ ## nAmE ## _cp(value)) \
2457 ANYOF_BITMAP_SET(data->start_class, value); \
2458 } \
2459 break
2460
2461
2462
76e3520e 2463STATIC I32
40d049e4 2464S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2465 I32 *minlenp, I32 *deltap,
40d049e4
YO
2466 regnode *last,
2467 scan_data_t *data,
2468 I32 stopparen,
2469 U8* recursed,
2470 struct regnode_charclass_class *and_withp,
2471 U32 flags, U32 depth)
c277df42
IZ
2472 /* scanp: Start here (read-write). */
2473 /* deltap: Write maxlen-minlen here. */
2474 /* last: Stop before this one. */
40d049e4
YO
2475 /* data: string data about the pattern */
2476 /* stopparen: treat close N as END */
2477 /* recursed: which subroutines have we recursed into */
2478 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2479{
97aff369 2480 dVAR;
c277df42
IZ
2481 I32 min = 0, pars = 0, code;
2482 regnode *scan = *scanp, *next;
2483 I32 delta = 0;
2484 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2485 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2486 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2487 scan_data_t data_fake;
a3621e74 2488 SV *re_trie_maxbuff = NULL;
786e8c11 2489 regnode *first_non_open = scan;
e2e6a0f1 2490 I32 stopmin = I32_MAX;
8aa23a47
YO
2491 scan_frame *frame = NULL;
2492
a3621e74 2493 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2494
13a24bad 2495#ifdef DEBUGGING
40d049e4 2496 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2497#endif
40d049e4 2498
786e8c11 2499 if ( depth == 0 ) {
40d049e4 2500 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2501 first_non_open=regnext(first_non_open);
2502 }
2503
b81d288d 2504
8aa23a47
YO
2505 fake_study_recurse:
2506 while ( scan && OP(scan) != END && scan < last ){
2507 /* Peephole optimizer: */
304ee84b 2508 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2509 DEBUG_PEEP("Peep",scan,depth);
2510 JOIN_EXACT(scan,&min,0);
2511
2512 /* Follow the next-chain of the current node and optimize
2513 away all the NOTHINGs from it. */
2514 if (OP(scan) != CURLYX) {
2515 const int max = (reg_off_by_arg[OP(scan)]
2516 ? I32_MAX
2517 /* I32 may be smaller than U16 on CRAYs! */
2518 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2519 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2520 int noff;
2521 regnode *n = scan;
2522
2523 /* Skip NOTHING and LONGJMP. */
2524 while ((n = regnext(n))
2525 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2526 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2527 && off + noff < max)
2528 off += noff;
2529 if (reg_off_by_arg[OP(scan)])
2530 ARG(scan) = off;
2531 else
2532 NEXT_OFF(scan) = off;
2533 }
a3621e74 2534
c277df42 2535
8aa23a47
YO
2536
2537 /* The principal pseudo-switch. Cannot be a switch, since we
2538 look into several different things. */
2539 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2540 || OP(scan) == IFTHEN) {
2541 next = regnext(scan);
2542 code = OP(scan);
2543 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2544
2545 if (OP(next) == code || code == IFTHEN) {
2546 /* NOTE - There is similar code to this block below for handling
2547 TRIE nodes on a re-study. If you change stuff here check there
2548 too. */
2549 I32 max1 = 0, min1 = I32_MAX, num = 0;
2550 struct regnode_charclass_class accum;
2551 regnode * const startbranch=scan;
2552
2553 if (flags & SCF_DO_SUBSTR)
304ee84b 2554 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2555 if (flags & SCF_DO_STCLASS)
2556 cl_init_zero(pRExC_state, &accum);
2557
2558 while (OP(scan) == code) {
2559 I32 deltanext, minnext, f = 0, fake;
2560 struct regnode_charclass_class this_class;
2561
2562 num++;
2563 data_fake.flags = 0;
2564 if (data) {
2565 data_fake.whilem_c = data->whilem_c;
2566 data_fake.last_closep = data->last_closep;
2567 }
2568 else
2569 data_fake.last_closep = &fake;
58e23c8d
YO
2570
2571 data_fake.pos_delta = delta;
8aa23a47
YO
2572 next = regnext(scan);
2573 scan = NEXTOPER(scan);
2574 if (code != BRANCH)
c277df42 2575 scan = NEXTOPER(scan);
8aa23a47
YO
2576 if (flags & SCF_DO_STCLASS) {
2577 cl_init(pRExC_state, &this_class);
2578 data_fake.start_class = &this_class;
2579 f = SCF_DO_STCLASS_AND;
58e23c8d 2580 }
8aa23a47
YO
2581 if (flags & SCF_WHILEM_VISITED_POS)
2582 f |= SCF_WHILEM_VISITED_POS;
2583
2584 /* we suppose the run is continuous, last=next...*/
2585 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2586 next, &data_fake,
2587 stopparen, recursed, NULL, f,depth+1);
2588 if (min1 > minnext)
2589 min1 = minnext;
2590 if (max1 < minnext + deltanext)
2591 max1 = minnext + deltanext;
2592 if (deltanext == I32_MAX)
2593 is_inf = is_inf_internal = 1;
2594 scan = next;
2595 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2596 pars++;
2597 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2598 if ( stopmin > minnext)
2599 stopmin = min + min1;
2600 flags &= ~SCF_DO_SUBSTR;
2601 if (data)
2602 data->flags |= SCF_SEEN_ACCEPT;
2603 }
2604 if (data) {
2605 if (data_fake.flags & SF_HAS_EVAL)
2606 data->flags |= SF_HAS_EVAL;
2607 data->whilem_c = data_fake.whilem_c;
3dab1dad 2608 }
8aa23a47
YO
2609 if (flags & SCF_DO_STCLASS)
2610 cl_or(pRExC_state, &accum, &this_class);
2611 }
2612 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2613 min1 = 0;
2614 if (flags & SCF_DO_SUBSTR) {
2615 data->pos_min += min1;
2616 data->pos_delta += max1 - min1;
2617 if (max1 != min1 || is_inf)
2618 data->longest = &(data->longest_float);
2619 }
2620 min += min1;
2621 delta += max1 - min1;
2622 if (flags & SCF_DO_STCLASS_OR) {
2623 cl_or(pRExC_state, data->start_class, &accum);
2624 if (min1) {
2625 cl_and(data->start_class, and_withp);
2626 flags &= ~SCF_DO_STCLASS;
653099ff 2627 }
8aa23a47
YO
2628 }
2629 else if (flags & SCF_DO_STCLASS_AND) {
2630 if (min1) {
2631 cl_and(data->start_class, &accum);
2632 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2633 }
8aa23a47
YO
2634 else {
2635 /* Switch to OR mode: cache the old value of
2636 * data->start_class */
2637 INIT_AND_WITHP;
2638 StructCopy(data->start_class, and_withp,
2639 struct regnode_charclass_class);
2640 flags &= ~SCF_DO_STCLASS_AND;
2641 StructCopy(&accum, data->start_class,
2642 struct regnode_charclass_class);
2643 flags |= SCF_DO_STCLASS_OR;
2644 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2645 }
8aa23a47 2646 }
a3621e74 2647
8aa23a47
YO
2648 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2649 /* demq.
a3621e74 2650
8aa23a47
YO
2651 Assuming this was/is a branch we are dealing with: 'scan' now
2652 points at the item that follows the branch sequence, whatever
2653 it is. We now start at the beginning of the sequence and look
2654 for subsequences of
a3621e74 2655
8aa23a47
YO
2656 BRANCH->EXACT=>x1
2657 BRANCH->EXACT=>x2
2658 tail
a3621e74 2659
8aa23a47 2660 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2661
8aa23a47
YO
2662 If we can find such a subseqence we need to turn the first
2663 element into a trie and then add the subsequent branch exact
2664 strings to the trie.
a3621e74 2665
8aa23a47 2666 We have two cases
a3621e74 2667
8aa23a47 2668 1. patterns where the whole set of branch can be converted.
a3621e74 2669
8aa23a47 2670 2. patterns where only a subset can be converted.
a3621e74 2671
8aa23a47
YO
2672 In case 1 we can replace the whole set with a single regop
2673 for the trie. In case 2 we need to keep the start and end
2674 branchs so
a3621e74 2675
8aa23a47
YO
2676 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2677 becomes BRANCH TRIE; BRANCH X;
786e8c11 2678
8aa23a47
YO
2679 There is an additional case, that being where there is a
2680 common prefix, which gets split out into an EXACT like node
2681 preceding the TRIE node.
a3621e74 2682
8aa23a47
YO
2683 If x(1..n)==tail then we can do a simple trie, if not we make
2684 a "jump" trie, such that when we match the appropriate word
2685 we "jump" to the appopriate tail node. Essentailly we turn
2686 a nested if into a case structure of sorts.
b515a41d 2687
8aa23a47
YO
2688 */
2689
2690 int made=0;
2691 if (!re_trie_maxbuff) {
2692 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2693 if (!SvIOK(re_trie_maxbuff))
2694 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2695 }
2696 if ( SvIV(re_trie_maxbuff)>=0 ) {
2697 regnode *cur;
2698 regnode *first = (regnode *)NULL;
2699 regnode *last = (regnode *)NULL;
2700 regnode *tail = scan;
2701 U8 optype = 0;
2702 U32 count=0;
a3621e74
YO
2703
2704#ifdef DEBUGGING
8aa23a47 2705 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2706#endif
8aa23a47
YO
2707 /* var tail is used because there may be a TAIL
2708 regop in the way. Ie, the exacts will point to the
2709 thing following the TAIL, but the last branch will
2710 point at the TAIL. So we advance tail. If we
2711 have nested (?:) we may have to move through several
2712 tails.
2713 */
2714
2715 while ( OP( tail ) == TAIL ) {
2716 /* this is the TAIL generated by (?:) */
2717 tail = regnext( tail );
2718 }
a3621e74 2719
8aa23a47
YO
2720
2721 DEBUG_OPTIMISE_r({
2722 regprop(RExC_rx, mysv, tail );
2723 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2724 (int)depth * 2 + 2, "",
2725 "Looking for TRIE'able sequences. Tail node is: ",
2726 SvPV_nolen_const( mysv )
2727 );
2728 });
2729
2730 /*
2731
2732 step through the branches, cur represents each
2733 branch, noper is the first thing to be matched
2734 as part of that branch and noper_next is the
2735 regnext() of that node. if noper is an EXACT
2736 and noper_next is the same as scan (our current
2737 position in the regex) then the EXACT branch is
2738 a possible optimization target. Once we have
2739 two or more consequetive such branches we can
2740 create a trie of the EXACT's contents and stich
2741 it in place. If the sequence represents all of
2742 the branches we eliminate the whole thing and
2743 replace it with a single TRIE. If it is a
2744 subsequence then we need to stitch it in. This
2745 means the first branch has to remain, and needs
2746 to be repointed at the item on the branch chain
2747 following the last branch optimized. This could
2748 be either a BRANCH, in which case the
2749 subsequence is internal, or it could be the
2750 item following the branch sequence in which
2751 case the subsequence is at the end.
2752
2753 */
2754
2755 /* dont use tail as the end marker for this traverse */
2756 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2757 regnode * const noper = NEXTOPER( cur );
b515a41d 2758#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2759 regnode * const noper_next = regnext( noper );
b515a41d
YO
2760#endif
2761
8aa23a47
YO
2762 DEBUG_OPTIMISE_r({
2763 regprop(RExC_rx, mysv, cur);
2764 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2765 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2766
2767 regprop(RExC_rx, mysv, noper);
2768 PerlIO_printf( Perl_debug_log, " -> %s",
2769 SvPV_nolen_const(mysv));
2770
2771 if ( noper_next ) {
2772 regprop(RExC_rx, mysv, noper_next );
2773 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2774 SvPV_nolen_const(mysv));
2775 }
2776 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2777 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2778 });
2779 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2780 : PL_regkind[ OP( noper ) ] == EXACT )
2781 || OP(noper) == NOTHING )
786e8c11 2782#ifdef NOJUMPTRIE
8aa23a47 2783 && noper_next == tail
786e8c11 2784#endif
8aa23a47
YO
2785 && count < U16_MAX)
2786 {
2787 count++;
2788 if ( !first || optype == NOTHING ) {
2789 if (!first) first = cur;
2790 optype = OP( noper );
2791 } else {
2792 last = cur;
2793 }
2794 } else {
a0a388a1
YO
2795/*
2796 Currently we assume that the trie can handle unicode and ascii
2797 matches fold cased matches. If this proves true then the following
2798 define will prevent tries in this situation.
2799
2800 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2801*/
2802#define TRIE_TYPE_IS_SAFE 1
2803 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2804 make_trie( pRExC_state,
2805 startbranch, first, cur, tail, count,
2806 optype, depth+1 );
2807 }
2808 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2809#ifdef NOJUMPTRIE
8aa23a47 2810 && noper_next == tail
786e8c11 2811#endif
8aa23a47
YO
2812 ){
2813 count = 1;
2814 first = cur;
2815 optype = OP( noper );
2816 } else {
2817 count = 0;
2818 first = NULL;
2819 optype = 0;
2820 }
2821 last = NULL;
2822 }
2823 }
2824 DEBUG_OPTIMISE_r({
2825 regprop(RExC_rx, mysv, cur);
2826 PerlIO_printf( Perl_debug_log,
2827 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2828 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2829
2830 });
a0a388a1
YO
2831
2832 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2833 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2834#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2835 if ( ((made == MADE_EXACT_TRIE &&
2836 startbranch == first)
2837 || ( first_non_open == first )) &&
2838 depth==0 ) {
2839 flags |= SCF_TRIE_RESTUDY;
2840 if ( startbranch == first
2841 && scan == tail )
2842 {
2843 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2844 }
2845 }
3dab1dad 2846#endif
8aa23a47
YO
2847 }
2848 }
2849
2850 } /* do trie */
2851
653099ff 2852 }
8aa23a47
YO
2853 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2854 scan = NEXTOPER(NEXTOPER(scan));
2855 } else /* single branch is optimized. */
2856 scan = NEXTOPER(scan);
2857 continue;
2858 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2859 scan_frame *newframe = NULL;
2860 I32 paren;
2861 regnode *start;
2862 regnode *end;
2863
2864 if (OP(scan) != SUSPEND) {
2865 /* set the pointer */
2866 if (OP(scan) == GOSUB) {
2867 paren = ARG(scan);
2868 RExC_recurse[ARG2L(scan)] = scan;
2869 start = RExC_open_parens[paren-1];
2870 end = RExC_close_parens[paren-1];
2871 } else {
2872 paren = 0;
f8fc2ecf 2873 start = RExC_rxi->program + 1;
8aa23a47
YO
2874 end = RExC_opend;
2875 }
2876 if (!recursed) {
2877 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2878 SAVEFREEPV(recursed);
2879 }
2880 if (!PAREN_TEST(recursed,paren+1)) {
2881 PAREN_SET(recursed,paren+1);
2882 Newx(newframe,1,scan_frame);
2883 } else {
2884 if (flags & SCF_DO_SUBSTR) {
304ee84b 2885 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2886 data->longest = &(data->longest_float);
2887 }
2888 is_inf = is_inf_internal = 1;
2889 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2890 cl_anything(pRExC_state, data->start_class);
2891 flags &= ~SCF_DO_STCLASS;
2892 }
2893 } else {
2894 Newx(newframe,1,scan_frame);
2895 paren = stopparen;
2896 start = scan+2;
2897 end = regnext(scan);
2898 }
2899 if (newframe) {
2900 assert(start);
2901 assert(end);
2902 SAVEFREEPV(newframe);
2903 newframe->next = regnext(scan);
2904 newframe->last = last;
2905 newframe->stop = stopparen;
2906 newframe->prev = frame;
2907
2908 frame = newframe;
2909 scan = start;
2910 stopparen = paren;
2911 last = end;
2912
2913 continue;
2914 }
2915 }
2916 else if (OP(scan) == EXACT) {
2917 I32 l = STR_LEN(scan);
2918 UV uc;
2919 if (UTF) {
2920 const U8 * const s = (U8*)STRING(scan);
2921 l = utf8_length(s, s + l);
2922 uc = utf8_to_uvchr(s, NULL);
2923 } else {
2924 uc = *((U8*)STRING(scan));
2925 }
2926 min += l;
2927 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2928 /* The code below prefers earlier match for fixed
2929 offset, later match for variable offset. */
2930 if (data->last_end == -1) { /* Update the start info. */
2931 data->last_start_min = data->pos_min;
2932 data->last_start_max = is_inf
2933 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2934 }
8aa23a47
YO
2935 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2936 if (UTF)
2937 SvUTF8_on(data->last_found);
2938 {
2939 SV * const sv = data->last_found;
2940 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2941 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2942 if (mg && mg->mg_len >= 0)
2943 mg->mg_len += utf8_length((U8*)STRING(scan),
2944 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2945 }
8aa23a47
YO
2946 data->last_end = data->pos_min + l;
2947 data->pos_min += l; /* As in the first entry. */
2948 data->flags &= ~SF_BEFORE_EOL;
2949 }
2950 if (flags & SCF_DO_STCLASS_AND) {
2951 /* Check whether it is compatible with what we know already! */
2952 int compat = 1;
2953
2954 if (uc >= 0x100 ||
2955 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2956 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2957 && (!(data->start_class->flags & ANYOF_FOLD)
2958 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2959 )
2960 compat = 0;
2961 ANYOF_CLASS_ZERO(data->start_class);
2962 ANYOF_BITMAP_ZERO(data->start_class);
2963 if (compat)
2964 ANYOF_BITMAP_SET(data->start_class, uc);
2965 data->start_class->flags &= ~ANYOF_EOS;
2966 if (uc < 0x100)
2967 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2968 }
2969 else if (flags & SCF_DO_STCLASS_OR) {
2970 /* false positive possible if the class is case-folded */
2971 if (uc < 0x100)
2972 ANYOF_BITMAP_SET(data->start_class, uc);
2973 else
2974 data->start_class->flags |= ANYOF_UNICODE_ALL;
2975 data->start_class->flags &= ~ANYOF_EOS;
2976 cl_and(data->start_class, and_withp);
2977 }
2978 flags &= ~SCF_DO_STCLASS;
2979 }
2980 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2981 I32 l = STR_LEN(scan);
2982 UV uc = *((U8*)STRING(scan));
2983
2984 /* Search for fixed substrings supports EXACT only. */
2985 if (flags & SCF_DO_SUBSTR) {
2986 assert(data);
304ee84b 2987 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2988 }
2989 if (UTF) {
2990 const U8 * const s = (U8 *)STRING(scan);
2991 l = utf8_length(s, s + l);
2992 uc = utf8_to_uvchr(s, NULL);
2993 }
2994 min += l;
2995 if (flags & SCF_DO_SUBSTR)
2996 data->pos_min += l;
2997 if (flags & SCF_DO_STCLASS_AND) {
2998 /* Check whether it is compatible with what we know already! */
2999 int compat = 1;
3000
3001 if (uc >= 0x100 ||
3002 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3003 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3004 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3005 compat = 0;
3006 ANYOF_CLASS_ZERO(data->start_class);
3007 ANYOF_BITMAP_ZERO(data->start_class);
3008 if (compat) {
3009 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3010 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3011 data->start_class->flags |= ANYOF_FOLD;
3012 if (OP(scan) == EXACTFL)
3013 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3014 }
8aa23a47
YO
3015 }
3016 else if (flags & SCF_DO_STCLASS_OR) {
3017 if (data->start_class->flags & ANYOF_FOLD) {
3018 /* false positive possible if the class is case-folded.
3019 Assume that the locale settings are the same... */
1aa99e6b
IH
3020 if (uc < 0x100)
3021 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3022 data->start_class->flags &= ~ANYOF_EOS;
3023 }
8aa23a47 3024 cl_and(data->start_class, and_withp);
653099ff 3025 }
8aa23a47
YO
3026 flags &= ~SCF_DO_STCLASS;
3027 }
3028 else if (strchr((const char*)PL_varies,OP(scan))) {
3029 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3030 I32 f = flags, pos_before = 0;
3031 regnode * const oscan = scan;
3032 struct regnode_charclass_class this_class;
3033 struct regnode_charclass_class *oclass = NULL;
3034 I32 next_is_eval = 0;
3035
3036 switch (PL_regkind[OP(scan)]) {
3037 case WHILEM: /* End of (?:...)* . */
3038 scan = NEXTOPER(scan);
3039 goto finish;
3040 case PLUS:
3041 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3042 next = NEXTOPER(scan);
3043 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3044 mincount = 1;
3045 maxcount = REG_INFTY;
3046 next = regnext(scan);
3047 scan = NEXTOPER(scan);
3048 goto do_curly;
3049 }
3050 }
3051 if (flags & SCF_DO_SUBSTR)
3052 data->pos_min++;
3053 min++;
3054 /* Fall through. */
3055 case STAR:
3056 if (flags & SCF_DO_STCLASS) {
3057 mincount = 0;
3058 maxcount = REG_INFTY;
3059 next = regnext(scan);
3060 scan = NEXTOPER(scan);
3061 goto do_curly;
3062 }
3063 is_inf = is_inf_internal = 1;
3064 scan = regnext(scan);
c277df42 3065 if (flags & SCF_DO_SUBSTR) {
304ee84b 3066 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3067 data->longest = &(data->longest_float);
c277df42 3068 }
8aa23a47
YO
3069 goto optimize_curly_tail;
3070 case CURLY:
3071 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3072 && (scan->flags == stopparen))
3073 {
3074 mincount = 1;
3075 maxcount = 1;
3076 } else {
3077 mincount = ARG1(scan);
3078 maxcount = ARG2(scan);
653099ff 3079 }
8aa23a47
YO
3080 next = regnext(scan);
3081 if (OP(scan) == CURLYX) {
3082 I32 lp = (data ? *(data->last_closep) : 0);
3083 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3084 }
8aa23a47
YO
3085 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3086 next_is_eval = (OP(scan) == EVAL);
3087 do_curly:
3088 if (flags & SCF_DO_SUBSTR) {
304ee84b 3089 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3090 pos_before = data->pos_min;
b45f050a 3091 }
8aa23a47
YO
3092 if (data) {
3093 fl = data->flags;
3094 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3095 if (is_inf)
3096 data->flags |= SF_IS_INF;
3097 }
3098 if (flags & SCF_DO_STCLASS) {
3099 cl_init(pRExC_state, &this_class);
3100 oclass = data->start_class;
3101 data->start_class = &this_class;
3102 f |= SCF_DO_STCLASS_AND;
3103 f &= ~SCF_DO_STCLASS_OR;
3104 }
3105 /* These are the cases when once a subexpression
3106 fails at a particular position, it cannot succeed
3107 even after backtracking at the enclosing scope.
3108
3109 XXXX what if minimal match and we are at the
3110 initial run of {n,m}? */
3111 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3112 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3113
8aa23a47
YO
3114 /* This will finish on WHILEM, setting scan, or on NULL: */
3115 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3116 last, data, stopparen, recursed, NULL,
3117 (mincount == 0
3118 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3119
8aa23a47
YO
3120 if (flags & SCF_DO_STCLASS)
3121 data->start_class = oclass;
3122 if (mincount == 0 || minnext == 0) {
3123 if (flags & SCF_DO_STCLASS_OR) {
3124 cl_or(pRExC_state, data->start_class, &this_class);
3125 }
3126 else if (flags & SCF_DO_STCLASS_AND) {
3127 /* Switch to OR mode: cache the old value of
3128 * data->start_class */
3129 INIT_AND_WITHP;
3130 StructCopy(data->start_class, and_withp,
3131 struct regnode_charclass_class);
3132 flags &= ~SCF_DO_STCLASS_AND;
3133 StructCopy(&this_class, data->start_class,
3134 struct regnode_charclass_class);
3135 flags |= SCF_DO_STCLASS_OR;
3136 data->start_class->flags |= ANYOF_EOS;
3137 }
3138 } else { /* Non-zero len */
3139 if (flags & SCF_DO_STCLASS_OR) {
3140 cl_or(pRExC_state, data->start_class, &this_class);
3141 cl_and(data->start_class, and_withp);
3142 }
3143 else if (flags & SCF_DO_STCLASS_AND)
3144 cl_and(data->start_class, &this_class);
3145 flags &= ~SCF_DO_STCLASS;
3146 }
3147 if (!scan) /* It was not CURLYX, but CURLY. */
3148 scan = next;
3149 if ( /* ? quantifier ok, except for (?{ ... }) */
3150 (next_is_eval || !(mincount == 0 && maxcount == 1))
3151 && (minnext == 0) && (deltanext == 0)
3152 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3153 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3154 && ckWARN(WARN_REGEXP))
3155 {
3156 vWARN(RExC_parse,
3157 "Quantifier unexpected on zero-length expression");
3158 }
3159
3160 min += minnext * mincount;
3161 is_inf_internal |= ((maxcount == REG_INFTY
3162 && (minnext + deltanext) > 0)
3163 || deltanext == I32_MAX);
3164 is_inf |= is_inf_internal;
3165 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3166
3167 /* Try powerful optimization CURLYX => CURLYN. */
3168 if ( OP(oscan) == CURLYX && data
3169 && data->flags & SF_IN_PAR
3170 && !(data->flags & SF_HAS_EVAL)
3171 && !deltanext && minnext == 1 ) {
3172 /* Try to optimize to CURLYN. */
3173 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3174 regnode * const nxt1 = nxt;
497b47a8 3175#ifdef DEBUGGING
8aa23a47 3176 regnode *nxt2;
497b47a8 3177#endif
c277df42 3178
8aa23a47
YO
3179 /* Skip open. */
3180 nxt = regnext(nxt);
3181 if (!strchr((const char*)PL_simple,OP(nxt))
3182 && !(PL_regkind[OP(nxt)] == EXACT
3183 && STR_LEN(nxt) == 1))
3184 goto nogo;
497b47a8 3185#ifdef DEBUGGING
8aa23a47 3186 nxt2 = nxt;
497b47a8 3187#endif
8aa23a47
YO
3188 nxt = regnext(nxt);
3189 if (OP(nxt) != CLOSE)
3190 goto nogo;
3191 if (RExC_open_parens) {
3192 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3193 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3194 }
3195 /* Now we know that nxt2 is the only contents: */
3196 oscan->flags = (U8)ARG(nxt);
3197 OP(oscan) = CURLYN;
3198 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3199
c277df42 3200#ifdef DEBUGGING
8aa23a47
YO
3201 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3202 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3203 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3204 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3205 OP(nxt + 1) = OPTIMIZED; /* was count. */
3206 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3207#endif
8aa23a47
YO
3208 }
3209 nogo:
3210
3211 /* Try optimization CURLYX => CURLYM. */
3212 if ( OP(oscan) == CURLYX && data
3213 && !(data->flags & SF_HAS_PAR)
3214 && !(data->flags & SF_HAS_EVAL)
3215 && !deltanext /* atom is fixed width */
3216 && minnext != 0 /* CURLYM can't handle zero width */
3217 ) {
3218 /* XXXX How to optimize if data == 0? */
3219 /* Optimize to a simpler form. */
3220 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3221 regnode *nxt2;
3222
3223 OP(oscan) = CURLYM;
3224 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3225 && (OP(nxt2) != WHILEM))
3226 nxt = nxt2;
3227 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3228 /* Need to optimize away parenths. */
3229 if (data->flags & SF_IN_PAR) {
3230 /* Set the parenth number. */
3231 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3232
3233 if (OP(nxt) != CLOSE)
3234 FAIL("Panic opt close");
3235 oscan->flags = (U8)ARG(nxt);
3236 if (RExC_open_parens) {
3237 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3238 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3239 }
8aa23a47
YO
3240 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3241 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3242
c277df42 3243#ifdef DEBUGGING
8aa23a47
YO
3244 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3245 OP(nxt + 1) = OPTIMIZED; /* was count. */
3246 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3247 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3248#endif
c277df42 3249#if 0
8aa23a47
YO
3250 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3251 regnode *nnxt = regnext(nxt1);
3252
3253 if (nnxt == nxt) {
3254 if (reg_off_by_arg[OP(nxt1)])
3255 ARG_SET(nxt1, nxt2 - nxt1);
3256 else if (nxt2 - nxt1 < U16_MAX)
3257 NEXT_OFF(nxt1) = nxt2 - nxt1;
3258 else
3259 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3260 }
8aa23a47 3261 nxt1 = nnxt;
c277df42 3262 }
5d1c421c 3263#endif
8aa23a47
YO
3264 /* Optimize again: */
3265 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3266 NULL, stopparen, recursed, NULL, 0,depth+1);
3267 }
3268 else
3269 oscan->flags = 0;
3270 }
3271 else if ((OP(oscan) == CURLYX)
3272 && (flags & SCF_WHILEM_VISITED_POS)
3273 /* See the comment on a similar expression above.
3274 However, this time it not a subexpression
3275 we care about, but the expression itself. */
3276 && (maxcount == REG_INFTY)
3277 && data && ++data->whilem_c < 16) {
3278 /* This stays as CURLYX, we can put the count/of pair. */
3279 /* Find WHILEM (as in regexec.c) */
3280 regnode *nxt = oscan + NEXT_OFF(oscan);
3281
3282 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3283 nxt += ARG(nxt);
3284 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3285 | (RExC_whilem_seen << 4)); /* On WHILEM */
3286 }
3287 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3288 pars++;
3289 if (flags & SCF_DO_SUBSTR) {
3290 SV *last_str = NULL;
3291 int counted = mincount != 0;
a0ed51b3 3292
8aa23a47
YO
3293 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3294#if defined(SPARC64_GCC_WORKAROUND)
3295 I32 b = 0;
3296 STRLEN l = 0;
3297 const char *s = NULL;
3298 I32 old = 0;
b515a41d 3299
8aa23a47
YO
3300 if (pos_before >= data->last_start_min)
3301 b = pos_before;
3302 else
3303 b = data->last_start_min;
b515a41d 3304
8aa23a47
YO
3305 l = 0;
3306 s = SvPV_const(data->last_found, l);
3307 old = b - data->last_start_min;
3308
3309#else
3310 I32 b = pos_before >= data->last_start_min
3311 ? pos_before : data->last_start_min;
3312 STRLEN l;
3313 const char * const s = SvPV_const(data->last_found, l);
3314 I32 old = b - data->last_start_min;
3315#endif
3316
3317 if (UTF)
3318 old = utf8_hop((U8*)s, old) - (U8*)s;
3319
3320 l -= old;
3321 /* Get the added string: */
740cce10 3322 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3323 if (deltanext == 0 && pos_before == b) {
3324 /* What was added is a constant string */
3325 if (mincount > 1) {
3326 SvGROW(last_str, (mincount * l) + 1);
3327 repeatcpy(SvPVX(last_str) + l,
3328 SvPVX_const(last_str), l, mincount - 1);
3329 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3330 /* Add additional parts. */
3331 SvCUR_set(data->last_found,
3332 SvCUR(data->last_found) - l);
3333 sv_catsv(data->last_found, last_str);
3334 {
3335 SV * sv = data->last_found;
3336 MAGIC *mg =
3337 SvUTF8(sv) && SvMAGICAL(sv) ?
3338 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3339 if (mg && mg->mg_len >= 0)
bd94e887 3340 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3341 }
8aa23a47 3342 data->last_end += l * (mincount - 1);
b515a41d 3343 }
8aa23a47
YO
3344 } else {
3345 /* start offset must point into the last copy */
3346 data->last_start_min += minnext * (mincount - 1);
3347 data->last_start_max += is_inf ? I32_MAX
3348 : (maxcount - 1) * (minnext + data->pos_delta);
3349 }
c277df42 3350 }
8aa23a47
YO
3351 /* It is counted once already... */
3352 data->pos_min += minnext * (mincount - counted);
3353 data->pos_delta += - counted * deltanext +
3354 (minnext + deltanext) * maxcount - minnext * mincount;
3355 if (mincount != maxcount) {
3356 /* Cannot extend fixed substrings found inside
3357 the group. */
304ee84b 3358 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3359 if (mincount && last_str) {
3360 SV * const sv = data->last_found;
3361 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3362 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3363
3364 if (mg)
3365 mg->mg_len = -1;
3366 sv_setsv(sv, last_str);
3367 data->last_end = data->pos_min;
3368 data->last_start_min =
3369 data->pos_min - CHR_SVLEN(last_str);
3370 data->last_start_max = is_inf
3371 ? I32_MAX
3372 : data->pos_min + data->pos_delta
3373 - CHR_SVLEN(last_str);
3374 }
3375 data->longest = &(data->longest_float);
3376 }
3377 SvREFCNT_dec(last_str);
c277df42 3378 }
8aa23a47
YO
3379 if (data && (fl & SF_HAS_EVAL))
3380 data->flags |= SF_HAS_EVAL;
3381 optimize_curly_tail:
3382 if (OP(oscan) != CURLYX) {
3383 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3384 && NEXT_OFF(next))
3385 NEXT_OFF(oscan) += NEXT_OFF(next);
3386 }
3387 continue;
3388 default: /* REF and CLUMP only? */
3389 if (flags & SCF_DO_SUBSTR) {
304ee84b 3390 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3391 data->longest = &(data->longest_float);
3392 }
3393 is_inf = is_inf_internal = 1;
3394 if (flags & SCF_DO_STCLASS_OR)
3395 cl_anything(pRExC_state, data->start_class);
3396 flags &= ~SCF_DO_STCLASS;
3397 break;
c277df42 3398 }
8aa23a47 3399 }
e1d1eefb
YO
3400 else if (OP(scan) == LNBREAK) {
3401 if (flags & SCF_DO_STCLASS) {
3402 int value = 0;
3403 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3404 if (flags & SCF_DO_STCLASS_AND) {
3405 for (value = 0; value < 256; value++)
e64b1bd1 3406 if (!is_VERTWS_cp(value))
e1d1eefb
YO
3407 ANYOF_BITMAP_CLEAR(data->start_class, value);
3408 }
3409 else {
3410 for (value = 0; value < 256; value++)
e64b1bd1 3411 if (is_VERTWS_cp(value))
e1d1eefb
YO
3412 ANYOF_BITMAP_SET(data->start_class, value);
3413 }
3414 if (flags & SCF_DO_STCLASS_OR)
3415 cl_and(data->start_class, and_withp);
3416 flags &= ~SCF_DO_STCLASS;
3417 }
3418 min += 1;
f9a79580 3419 delta += 1;
e1d1eefb
YO
3420 if (flags & SCF_DO_SUBSTR) {
3421 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3422 data->pos_min += 1;
f9a79580 3423 data->pos_delta += 1;
e1d1eefb
YO
3424 data->longest = &(data->longest_float);
3425 }
3426
3427 }
f9a79580
RGS
3428 else if (OP(scan) == FOLDCHAR) {
3429 int d = ARG(scan)==0xDF ? 1 : 2;
3430 flags &= ~SCF_DO_STCLASS;
3431 min += 1;
3432 delta += d;
3433 if (flags & SCF_DO_SUBSTR) {
3434 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3435 data->pos_min += 1;
3436 data->pos_delta += d;
3437 data->longest = &(data->longest_float);
3438 }
3439 }
8aa23a47
YO
3440 else if (strchr((const char*)PL_simple,OP(scan))) {
3441 int value = 0;
653099ff 3442
8aa23a47 3443 if (flags & SCF_DO_SUBSTR) {
304ee84b 3444 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3445 data->pos_min++;
3446 }
3447 min++;
3448 if (flags & SCF_DO_STCLASS) {
3449 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3450
8aa23a47
YO
3451 /* Some of the logic below assumes that switching
3452 locale on will only add false positives. */
3453 switch (PL_regkind[OP(scan)]) {
3454 case SANY:
3455 default:
3456 do_default:
3457 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3458 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3459 cl_anything(pRExC_state, data->start_class);
3460 break;
3461 case REG_ANY:
3462 if (OP(scan) == SANY)
3463 goto do_default;
3464 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3465 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3466 || (data->start_class->flags & ANYOF_CLASS));
3467 cl_anything(pRExC_state, data->start_class);
653099ff 3468 }
8aa23a47
YO
3469 if (flags & SCF_DO_STCLASS_AND || !value)
3470 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3471 break;
3472 case ANYOF:
3473 if (flags & SCF_DO_STCLASS_AND)
3474 cl_and(data->start_class,
3475 (struct regnode_charclass_class*)scan);
653099ff 3476 else
8aa23a47
YO
3477 cl_or(pRExC_state, data->start_class,
3478 (struct regnode_charclass_class*)scan);
3479 break;
3480 case ALNUM:
3481 if (flags & SCF_DO_STCLASS_AND) {
3482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3484 for (value = 0; value < 256; value++)
3485 if (!isALNUM(value))
3486 ANYOF_BITMAP_CLEAR(data->start_class, value);
3487 }
653099ff 3488 }
8aa23a47
YO
3489 else {
3490 if (data->start_class->flags & ANYOF_LOCALE)
3491 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3492 else {
3493 for (value = 0; value < 256; value++)
3494 if (isALNUM(value))
3495 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3496 }
8aa23a47
YO
3497 }
3498 break;
3499 case ALNUML:
3500 if (flags & SCF_DO_STCLASS_AND) {
3501 if (data->start_class->flags & ANYOF_LOCALE)
3502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3503 }
3504 else {
3505 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3506 data->start_class->flags |= ANYOF_LOCALE;
3507 }
3508 break;
3509 case NALNUM:
3510 if (flags & SCF_DO_STCLASS_AND) {
3511 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3512 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3513 for (value = 0; value < 256; value++)
3514 if (isALNUM(value))
3515 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3516 }
3517 }
8aa23a47
YO
3518 else {
3519 if (data->start_class->flags & ANYOF_LOCALE)
3520 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3521 else {
3522 for (value = 0; value < 256; value++)
3523 if (!isALNUM(value))
3524 ANYOF_BITMAP_SET(data->start_class, value);
3525 }
653099ff 3526 }
8aa23a47
YO
3527 break;
3528 case NALNUML:
3529 if (flags & SCF_DO_STCLASS_AND) {
3530 if (data->start_class->flags & ANYOF_LOCALE)
3531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3532 }
8aa23a47
YO
3533 else {
3534 data->start_class->flags |= ANYOF_LOCALE;
3535 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3536 }
3537 break;
3538 case SPACE:
3539 if (flags & SCF_DO_STCLASS_AND) {
3540 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3542 for (value = 0; value < 256; value++)
3543 if (!isSPACE(value))
3544 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3545 }
3546 }
8aa23a47
YO
3547 else {
3548 if (data->start_class->flags & ANYOF_LOCALE)
3549 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3550 else {
3551 for (value = 0; value < 256; value++)
3552 if (isSPACE(value))
3553 ANYOF_BITMAP_SET(data->start_class, value);
3554 }
653099ff 3555 }
8aa23a47
YO
3556 break;
3557 case SPACEL:
3558 if (flags & SCF_DO_STCLASS_AND) {
3559 if (data->start_class->flags & ANYOF_LOCALE)
3560 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3561 }
3562 else {
3563 data->start_class->flags |= ANYOF_LOCALE;
3564 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3565 }
3566 break;
3567 case NSPACE:
3568 if (flags & SCF_DO_STCLASS_AND) {
3569 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3570 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3571 for (value = 0; value < 256; value++)
3572 if (isSPACE(value))
3573 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3574 }
8aa23a47
YO
3575 }
3576 else {
3577 if (data->start_class->flags & ANYOF_LOCALE)
3578 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3579 else {
3580 for (value = 0; value < 256; value++)
3581 if (!isSPACE(value))
3582 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3583 }
3584 }
8aa23a47
YO
3585 break;
3586 case NSPACEL:
3587 if (flags & SCF_DO_STCLASS_AND) {
3588 if (data->start_class->flags & ANYOF_LOCALE) {
3589 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3590 for (value = 0; value < 256; value++)
3591 if (!isSPACE(value))
3592 ANYOF_BITMAP_CLEAR(data->start_class, value);
3593 }
653099ff 3594 }
8aa23a47
YO
3595 else {
3596 data->start_class->flags |= ANYOF_LOCALE;
3597 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3598 }
3599 break;
3600 case DIGIT:
3601 if (flags & SCF_DO_STCLASS_AND) {
3602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3603 for (value = 0; value < 256; value++)
3604 if (!isDIGIT(value))
3605 ANYOF_BITMAP_CLEAR(data->start_class, value);
3606 }
3607 else {
3608 if (data->start_class->flags & ANYOF_LOCALE)
3609 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3610 else {
3611 for (value = 0; value < 256; value++)
3612 if (isDIGIT(value))
3613 ANYOF_BITMAP_SET(data->start_class, value);
3614 }
3615 }
3616 break;
3617 case NDIGIT:
3618 if (flags & SCF_DO_STCLASS_AND) {
3619 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3620 for (value = 0; value < 256; value++)
3621 if (isDIGIT(value))
3622 ANYOF_BITMAP_CLEAR(data->start_class, value);
3623 }
3624 else {
3625 if (data->start_class->flags & ANYOF_LOCALE)
3626 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3627 else {
3628 for (value = 0; value < 256; value++)
3629 if (!isDIGIT(value))
3630 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3631 }
3632 }
8aa23a47 3633 break;
e1d1eefb
YO
3634 CASE_SYNST_FNC(VERTWS);
3635 CASE_SYNST_FNC(HORIZWS);
3636
8aa23a47
YO
3637 }
3638 if (flags & SCF_DO_STCLASS_OR)
3639 cl_and(data->start_class, and_withp);
3640 flags &= ~SCF_DO_STCLASS;
3641 }
3642 }
3643 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3644 data->flags |= (OP(scan) == MEOL
3645 ? SF_BEFORE_MEOL
3646 : SF_BEFORE_SEOL);
3647 }
3648 else if ( PL_regkind[OP(scan)] == BRANCHJ
3649 /* Lookbehind, or need to calculate parens/evals/stclass: */
3650 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3651 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3652 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3653 || OP(scan) == UNLESSM )
3654 {
3655 /* Negative Lookahead/lookbehind
3656 In this case we can't do fixed string optimisation.
3657 */
1de06328 3658
8aa23a47
YO
3659 I32 deltanext, minnext, fake = 0;
3660 regnode *nscan;
3661 struct regnode_charclass_class intrnl;
3662 int f = 0;
1de06328 3663
8aa23a47
YO
3664 data_fake.flags = 0;
3665 if (data) {
3666 data_fake.whilem_c = data->whilem_c;
3667 data_fake.last_closep = data->last_closep;
c277df42 3668 }
8aa23a47
YO
3669 else
3670 data_fake.last_closep = &fake;
58e23c8d 3671 data_fake.pos_delta = delta;
8aa23a47
YO
3672 if ( flags & SCF_DO_STCLASS && !scan->flags
3673 && OP(scan) == IFMATCH ) { /* Lookahead */
3674 cl_init(pRExC_state, &intrnl);
3675 data_fake.start_class = &intrnl;
3676 f |= SCF_DO_STCLASS_AND;
3677 }
3678 if (flags & SCF_WHILEM_VISITED_POS)
3679 f |= SCF_WHILEM_VISITED_POS;
3680 next = regnext(scan);
3681 nscan = NEXTOPER(NEXTOPER(scan));
3682 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3683 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3684 if (scan->flags) {
3685 if (deltanext) {
58e23c8d 3686 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3687 }
3688 else if (minnext > (I32)U8_MAX) {
58e23c8d 3689 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3690 }
3691 scan->flags = (U8)minnext;
3692 }
3693 if (data) {
3694 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3695 pars++;
3696 if (data_fake.flags & SF_HAS_EVAL)
3697 data->flags |= SF_HAS_EVAL;
3698 data->whilem_c = data_fake.whilem_c;
3699 }
3700 if (f & SCF_DO_STCLASS_AND) {
3701 const int was = (data->start_class->flags & ANYOF_EOS);
3702
3703 cl_and(data->start_class, &intrnl);
3704 if (was)
3705 data->start_class->flags |= ANYOF_EOS;
3706 }
cb434fcc 3707 }
8aa23a47
YO
3708#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3709 else {
3710 /* Positive Lookahead/lookbehind
3711 In this case we can do fixed string optimisation,
3712 but we must be careful about it. Note in the case of
3713 lookbehind the positions will be offset by the minimum
3714 length of the pattern, something we won't know about
3715 until after the recurse.
3716 */
3717 I32 deltanext, fake = 0;
3718 regnode *nscan;
3719 struct regnode_charclass_class intrnl;
3720 int f = 0;
3721 /* We use SAVEFREEPV so that when the full compile
3722 is finished perl will clean up the allocated
3723 minlens when its all done. This was we don't
3724 have to worry about freeing them when we know
3725 they wont be used, which would be a pain.
3726 */
3727 I32 *minnextp;
3728 Newx( minnextp, 1, I32 );
3729 SAVEFREEPV(minnextp);
3730
3731 if (data) {
3732 StructCopy(data, &data_fake, scan_data_t);
3733 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3734 f |= SCF_DO_SUBSTR;
3735 if (scan->flags)
304ee84b 3736 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3737 data_fake.last_found=newSVsv(data->last_found);
3738 }
3739 }
3740 else
3741 data_fake.last_closep = &fake;
3742 data_fake.flags = 0;
58e23c8d 3743 data_fake.pos_delta = delta;
8aa23a47
YO
3744 if (is_inf)
3745 data_fake.flags |= SF_IS_INF;
3746 if ( flags & SCF_DO_STCLASS && !scan->flags
3747 && OP(scan) == IFMATCH ) { /* Lookahead */
3748 cl_init(pRExC_state, &intrnl);
3749 data_fake.start_class = &intrnl;
3750 f |= SCF_DO_STCLASS_AND;
3751 }
3752 if (flags & SCF_WHILEM_VISITED_POS)
3753 f |= SCF_WHILEM_VISITED_POS;
3754 next = regnext(scan);
3755 nscan = NEXTOPER(NEXTOPER(scan));
3756
3757 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3758 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3759 if (scan->flags) {
3760 if (deltanext) {
58e23c8d 3761 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3762 }
3763 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3764 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3765 }
3766 scan->flags = (U8)*minnextp;
3767 }
3768
3769 *minnextp += min;
3770
3771 if (f & SCF_DO_STCLASS_AND) {
3772 const int was = (data->start_class->flags & ANYOF_EOS);
3773
3774 cl_and(data->start_class, &intrnl);
3775 if (was)
3776 data->start_class->flags |= ANYOF_EOS;
3777 }
3778 if (data) {
3779 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3780 pars++;
3781 if (data_fake.flags & SF_HAS_EVAL)
3782 data->flags |= SF_HAS_EVAL;
3783 data->whilem_c = data_fake.whilem_c;
3784 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3785 if (RExC_rx->minlen<*minnextp)
3786 RExC_rx->minlen=*minnextp;
304ee84b 3787 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3788 SvREFCNT_dec(data_fake.last_found);
3789
3790 if ( data_fake.minlen_fixed != minlenp )
3791 {
3792 data->offset_fixed= data_fake.offset_fixed;
3793 data->minlen_fixed= data_fake.minlen_fixed;
3794 data->lookbehind_fixed+= scan->flags;
3795 }
3796 if ( data_fake.minlen_float != minlenp )
3797 {
3798 data->minlen_float= data_fake.minlen_float;
3799 data->offset_float_min=data_fake.offset_float_min;
3800 data->offset_float_max=data_fake.offset_float_max;
3801 data->lookbehind_float+= scan->flags;
3802 }
3803 }
3804 }</