This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #56526] m/a{1,0}/ compiles but doesn't match a literal string
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
61 **** by Larry Wall and others
a687059c 62 ****
9ef589d8
LW
63 **** You may distribute under the terms of either the GNU General Public
64 **** License or the Artistic License, as specified in the README file.
65
a687059c
LW
66 *
67 * Beware that some of this code is subtly aware of the way operator
68 * precedence is structured in regular expressions. Serious changes in
69 * regular-expression syntax might require a total rethink.
70 */
71#include "EXTERN.h"
864dbfa3 72#define PERL_IN_REGCOMP_C
a687059c 73#include "perl.h"
d06ea78c 74
acfe0abc 75#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
76# include "INTERN.h"
77#endif
c277df42
IZ
78
79#define REG_COMP_C
54df2634
NC
80#ifdef PERL_IN_XSUB_RE
81# include "re_comp.h"
82#else
83# include "regcomp.h"
84#endif
a687059c 85
d4cce5f1 86#ifdef op
11343788 87#undef op
d4cce5f1 88#endif /* op */
11343788 89
fe14fcc3 90#ifdef MSDOS
7e4e8c89 91# if defined(BUGGY_MSC6)
fe14fcc3 92 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 93# pragma optimize("a",off)
fe14fcc3 94 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
95# pragma optimize("w",on )
96# endif /* BUGGY_MSC6 */
fe14fcc3
LW
97#endif /* MSDOS */
98
a687059c
LW
99#ifndef STATIC
100#define STATIC static
101#endif
102
830247a4 103typedef struct RExC_state_t {
e2509266 104 U32 flags; /* are we folding, multilining? */
830247a4 105 char *precomp; /* uncompiled string. */
288b8c02 106 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
107 regexp *rx; /* perl core regexp structure */
108 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 109 char *start; /* Start of input for compile */
830247a4
IZ
110 char *end; /* End of input for compile */
111 char *parse; /* Input-scan pointer. */
112 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 113 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 114 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 115 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
116 I32 naughty; /* How bad is this pattern? */
117 I32 sawback; /* Did we see \1, ...? */
118 U32 seen;
119 I32 size; /* Code size. */
c74340f9
YO
120 I32 npar; /* Capture buffer count, (OPEN). */
121 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 122 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
123 I32 extralen;
124 I32 seen_zerolen;
125 I32 seen_evals;
40d049e4
YO
126 regnode **open_parens; /* pointers to open parens */
127 regnode **close_parens; /* pointers to close parens */
128 regnode *opend; /* END node in program */
02daf0ab
YO
129 I32 utf8; /* whether the pattern is utf8 or not */
130 I32 orig_utf8; /* whether the pattern was originally in utf8 */
131 /* XXX use this for future optimisation of case
132 * where pattern must be upgraded to utf8. */
6bda09f9 133 HV *charnames; /* cache of named sequences */
81714fb9 134 HV *paren_names; /* Paren names */
1f1031fe 135
40d049e4
YO
136 regnode **recurse; /* Recurse regops */
137 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
138#if ADD_TO_REGEXEC
139 char *starttry; /* -Dr: where regtry was called. */
140#define RExC_starttry (pRExC_state->starttry)
141#endif
3dab1dad 142#ifdef DEBUGGING
be8e71aa 143 const char *lastparse;
3dab1dad 144 I32 lastnum;
1f1031fe 145 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
146#define RExC_lastparse (pRExC_state->lastparse)
147#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 148#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 149#endif
830247a4
IZ
150} RExC_state_t;
151
e2509266 152#define RExC_flags (pRExC_state->flags)
830247a4 153#define RExC_precomp (pRExC_state->precomp)
288b8c02 154#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 155#define RExC_rx (pRExC_state->rx)
f8fc2ecf 156#define RExC_rxi (pRExC_state->rxi)
fac92740 157#define RExC_start (pRExC_state->start)
830247a4
IZ
158#define RExC_end (pRExC_state->end)
159#define RExC_parse (pRExC_state->parse)
160#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
161#ifdef RE_TRACK_PATTERN_OFFSETS
162#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
163#endif
830247a4 164#define RExC_emit (pRExC_state->emit)
fac92740 165#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 166#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
167#define RExC_naughty (pRExC_state->naughty)
168#define RExC_sawback (pRExC_state->sawback)
169#define RExC_seen (pRExC_state->seen)
170#define RExC_size (pRExC_state->size)
171#define RExC_npar (pRExC_state->npar)
e2e6a0f1 172#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
173#define RExC_extralen (pRExC_state->extralen)
174#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
175#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 176#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 177#define RExC_orig_utf8 (pRExC_state->orig_utf8)
fc8cd66c 178#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
179#define RExC_open_parens (pRExC_state->open_parens)
180#define RExC_close_parens (pRExC_state->close_parens)
181#define RExC_opend (pRExC_state->opend)
81714fb9 182#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
183#define RExC_recurse (pRExC_state->recurse)
184#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 185
cde0cee5 186
a687059c
LW
187#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
a687059c 190
35c8bce7
LW
191#ifdef SPSTART
192#undef SPSTART /* dratted cpp namespace... */
193#endif
a687059c
LW
194/*
195 * Flags to be passed up and down.
196 */
a687059c 197#define WORST 0 /* Worst case. */
a3b492c3
YO
198#define HASWIDTH 0x01 /* Known to match non-null strings. */
199#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
200#define SPSTART 0x04 /* Starts with * or +. */
201#define TRYAGAIN 0x08 /* Weeded out a declaration. */
202#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 203
3dab1dad
YO
204#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
205
07be1b83
YO
206/* whether trie related optimizations are enabled */
207#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208#define TRIE_STUDY_OPT
786e8c11 209#define FULL_TRIE_STUDY
07be1b83
YO
210#define TRIE_STCLASS
211#endif
1de06328
YO
212
213
40d049e4
YO
214
215#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216#define PBITVAL(paren) (1 << ((paren) & 7))
217#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
220
221
1de06328
YO
222/* About scan_data_t.
223
224 During optimisation we recurse through the regexp program performing
225 various inplace (keyhole style) optimisations. In addition study_chunk
226 and scan_commit populate this data structure with information about
227 what strings MUST appear in the pattern. We look for the longest
228 string that must appear for at a fixed location, and we look for the
229 longest string that may appear at a floating location. So for instance
230 in the pattern:
231
232 /FOO[xX]A.*B[xX]BAR/
233
234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235 strings (because they follow a .* construct). study_chunk will identify
236 both FOO and BAR as being the longest fixed and floating strings respectively.
237
238 The strings can be composites, for instance
239
240 /(f)(o)(o)/
241
242 will result in a composite fixed substring 'foo'.
243
244 For each string some basic information is maintained:
245
246 - offset or min_offset
247 This is the position the string must appear at, or not before.
248 It also implicitly (when combined with minlenp) tells us how many
249 character must match before the string we are searching.
250 Likewise when combined with minlenp and the length of the string
251 tells us how many characters must appear after the string we have
252 found.
253
254 - max_offset
255 Only used for floating strings. This is the rightmost point that
256 the string can appear at. Ifset to I32 max it indicates that the
257 string can occur infinitely far to the right.
258
259 - minlenp
260 A pointer to the minimum length of the pattern that the string
261 was found inside. This is important as in the case of positive
262 lookahead or positive lookbehind we can have multiple patterns
263 involved. Consider
264
265 /(?=FOO).*F/
266
267 The minimum length of the pattern overall is 3, the minimum length
268 of the lookahead part is 3, but the minimum length of the part that
269 will actually match is 1. So 'FOO's minimum length is 3, but the
270 minimum length for the F is 1. This is important as the minimum length
271 is used to determine offsets in front of and behind the string being
272 looked for. Since strings can be composites this is the length of the
273 pattern at the time it was commited with a scan_commit. Note that
274 the length is calculated by study_chunk, so that the minimum lengths
275 are not known until the full pattern has been compiled, thus the
276 pointer to the value.
277
278 - lookbehind
279
280 In the case of lookbehind the string being searched for can be
281 offset past the start point of the final matching string.
282 If this value was just blithely removed from the min_offset it would
283 invalidate some of the calculations for how many chars must match
284 before or after (as they are derived from min_offset and minlen and
285 the length of the string being searched for).
286 When the final pattern is compiled and the data is moved from the
287 scan_data_t structure into the regexp structure the information
288 about lookbehind is factored in, with the information that would
289 have been lost precalculated in the end_shift field for the
290 associated string.
291
292 The fields pos_min and pos_delta are used to store the minimum offset
293 and the delta to the maximum offset at the current point in the pattern.
294
295*/
2c2d71f5
JH
296
297typedef struct scan_data_t {
1de06328
YO
298 /*I32 len_min; unused */
299 /*I32 len_delta; unused */
2c2d71f5
JH
300 I32 pos_min;
301 I32 pos_delta;
302 SV *last_found;
1de06328 303 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
304 I32 last_start_min;
305 I32 last_start_max;
1de06328
YO
306 SV **longest; /* Either &l_fixed, or &l_float. */
307 SV *longest_fixed; /* longest fixed string found in pattern */
308 I32 offset_fixed; /* offset where it starts */
309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
311 SV *longest_float; /* longest floating string found in pattern */
312 I32 offset_float_min; /* earliest point in string it can appear */
313 I32 offset_float_max; /* latest point in string it can appear */
314 I32 *minlen_float; /* pointer to the minlen relevent to the string */
315 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
316 I32 flags;
317 I32 whilem_c;
cb434fcc 318 I32 *last_closep;
653099ff 319 struct regnode_charclass_class *start_class;
2c2d71f5
JH
320} scan_data_t;
321
a687059c 322/*
e50aee73 323 * Forward declarations for pregcomp()'s friends.
a687059c 324 */
a0d0e21e 325
27da23d5 326static const scan_data_t zero_scan_data =
1de06328 327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
328
329#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
330#define SF_BEFORE_SEOL 0x0001
331#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
332#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
334
09b7f37c
CB
335#ifdef NO_UNARY_PLUS
336# define SF_FIX_SHIFT_EOL (0+2)
337# define SF_FL_SHIFT_EOL (0+4)
338#else
339# define SF_FIX_SHIFT_EOL (+2)
340# define SF_FL_SHIFT_EOL (+4)
341#endif
c277df42
IZ
342
343#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
345
346#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
348#define SF_IS_INF 0x0040
349#define SF_HAS_PAR 0x0080
350#define SF_IN_PAR 0x0100
351#define SF_HAS_EVAL 0x0200
352#define SCF_DO_SUBSTR 0x0400
653099ff
GS
353#define SCF_DO_STCLASS_AND 0x0800
354#define SCF_DO_STCLASS_OR 0x1000
355#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 356#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 357
786e8c11 358#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 359#define SCF_SEEN_ACCEPT 0x8000
07be1b83 360
eb160463 361#define UTF (RExC_utf8 != 0)
bbe252da
YO
362#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 364
ffc61ed2 365#define OOB_UNICODE 12345678
93733859 366#define OOB_NAMEDCLASS -1
b8c5462f 367
a0ed51b3
LW
368#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
370
8615cb43 371
b45f050a
JF
372/* length of regex to show in messages that don't mark a position within */
373#define RegexLengthToShowInErrorMessages 127
374
375/*
376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378 * op/pragma/warn/regcomp.
379 */
7253e4e3
RK
380#define MARKER1 "<-- HERE" /* marker as it appears in the description */
381#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 382
7253e4e3 383#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
384
385/*
386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387 * arg. Show regex, up to a maximum length. If it's too long, chop and add
388 * "...".
389 */
58e23c8d 390#define _FAIL(code) STMT_START { \
bfed75c6 391 const char *ellipses = ""; \
ccb2c380
MP
392 IV len = RExC_end - RExC_precomp; \
393 \
394 if (!SIZE_ONLY) \
288b8c02 395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
396 if (len > RegexLengthToShowInErrorMessages) { \
397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
398 len = RegexLengthToShowInErrorMessages - 10; \
399 ellipses = "..."; \
400 } \
58e23c8d 401 code; \
ccb2c380 402} STMT_END
8615cb43 403
58e23c8d
YO
404#define FAIL(msg) _FAIL( \
405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
406 msg, (int)len, RExC_precomp, ellipses))
407
408#define FAIL2(msg,arg) _FAIL( \
409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
410 arg, (int)len, RExC_precomp, ellipses))
411
b45f050a 412/*
b45f050a
JF
413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
414 */
ccb2c380 415#define Simple_vFAIL(m) STMT_START { \
a28509cc 416 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
419} STMT_END
b45f050a
JF
420
421/*
422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
423 */
ccb2c380
MP
424#define vFAIL(m) STMT_START { \
425 if (!SIZE_ONLY) \
288b8c02 426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
427 Simple_vFAIL(m); \
428} STMT_END
b45f050a
JF
429
430/*
431 * Like Simple_vFAIL(), but accepts two arguments.
432 */
ccb2c380 433#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 434 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
436 (int)offset, RExC_precomp, RExC_precomp + offset); \
437} STMT_END
b45f050a
JF
438
439/*
440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
441 */
ccb2c380
MP
442#define vFAIL2(m,a1) STMT_START { \
443 if (!SIZE_ONLY) \
288b8c02 444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
445 Simple_vFAIL2(m, a1); \
446} STMT_END
b45f050a
JF
447
448
449/*
450 * Like Simple_vFAIL(), but accepts three arguments.
451 */
ccb2c380 452#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 453 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
455 (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
b45f050a
JF
457
458/*
459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
460 */
ccb2c380
MP
461#define vFAIL3(m,a1,a2) STMT_START { \
462 if (!SIZE_ONLY) \
288b8c02 463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
464 Simple_vFAIL3(m, a1, a2); \
465} STMT_END
b45f050a
JF
466
467/*
468 * Like Simple_vFAIL(), but accepts four arguments.
469 */
ccb2c380 470#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 471 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
473 (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
b45f050a 475
ccb2c380 476#define vWARN(loc,m) STMT_START { \
a28509cc 477 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
479 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
480} STMT_END
481
482#define vWARNdep(loc,m) STMT_START { \
a28509cc 483 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
484 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
485 "%s" REPORT_LOCATION, \
486 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
487} STMT_END
488
489
490#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 491 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
492 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
493 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
494} STMT_END
495
496#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 497 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
498 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
499 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
500} STMT_END
501
502#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 503 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
504 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
505 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
506} STMT_END
507
508#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 509 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
510 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
511 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
512} STMT_END
9d1d55b5 513
8615cb43 514
cd439c50 515/* Allow for side effects in s */
ccb2c380
MP
516#define REGC(c,s) STMT_START { \
517 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
518} STMT_END
cd439c50 519
fac92740
MJD
520/* Macros for recording node offsets. 20001227 mjd@plover.com
521 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
522 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
523 * Element 0 holds the number n.
07be1b83 524 * Position is 1 indexed.
fac92740 525 */
7122b237
YO
526#ifndef RE_TRACK_PATTERN_OFFSETS
527#define Set_Node_Offset_To_R(node,byte)
528#define Set_Node_Offset(node,byte)
529#define Set_Cur_Node_Offset
530#define Set_Node_Length_To_R(node,len)
531#define Set_Node_Length(node,len)
532#define Set_Node_Cur_Length(node)
533#define Node_Offset(n)
534#define Node_Length(n)
535#define Set_Node_Offset_Length(node,offset,len)
536#define ProgLen(ri) ri->u.proglen
537#define SetProgLen(ri,x) ri->u.proglen = x
538#else
539#define ProgLen(ri) ri->u.offsets[0]
540#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
541#define Set_Node_Offset_To_R(node,byte) STMT_START { \
542 if (! SIZE_ONLY) { \
543 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 544 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 545 if((node) < 0) { \
551405c4 546 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
547 } else { \
548 RExC_offsets[2*(node)-1] = (byte); \
549 } \
550 } \
551} STMT_END
552
553#define Set_Node_Offset(node,byte) \
554 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
555#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
556
557#define Set_Node_Length_To_R(node,len) STMT_START { \
558 if (! SIZE_ONLY) { \
559 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 560 __LINE__, (int)(node), (int)(len))); \
ccb2c380 561 if((node) < 0) { \
551405c4 562 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
563 } else { \
564 RExC_offsets[2*(node)] = (len); \
565 } \
566 } \
567} STMT_END
568
569#define Set_Node_Length(node,len) \
570 Set_Node_Length_To_R((node)-RExC_emit_start, len)
571#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
572#define Set_Node_Cur_Length(node) \
573 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
574
575/* Get offsets and lengths */
576#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
577#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
578
07be1b83
YO
579#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
580 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
581 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
582} STMT_END
7122b237 583#endif
07be1b83
YO
584
585#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
586#define EXPERIMENTAL_INPLACESCAN
7122b237 587#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 588
304ee84b
YO
589#define DEBUG_STUDYDATA(str,data,depth) \
590DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 591 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
592 "%*s" str "Pos:%"IVdf"/%"IVdf \
593 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
594 (int)(depth)*2, "", \
595 (IV)((data)->pos_min), \
596 (IV)((data)->pos_delta), \
304ee84b 597 (UV)((data)->flags), \
1de06328 598 (IV)((data)->whilem_c), \
304ee84b
YO
599 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
600 is_inf ? "INF " : "" \
1de06328
YO
601 ); \
602 if ((data)->last_found) \
603 PerlIO_printf(Perl_debug_log, \
604 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
605 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
606 SvPVX_const((data)->last_found), \
607 (IV)((data)->last_end), \
608 (IV)((data)->last_start_min), \
609 (IV)((data)->last_start_max), \
610 ((data)->longest && \
611 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
612 SvPVX_const((data)->longest_fixed), \
613 (IV)((data)->offset_fixed), \
614 ((data)->longest && \
615 (data)->longest==&((data)->longest_float)) ? "*" : "", \
616 SvPVX_const((data)->longest_float), \
617 (IV)((data)->offset_float_min), \
618 (IV)((data)->offset_float_max) \
619 ); \
620 PerlIO_printf(Perl_debug_log,"\n"); \
621});
622
acfe0abc 623static void clear_re(pTHX_ void *r);
4327152a 624
653099ff 625/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 626 Update the longest found anchored substring and the longest found
653099ff
GS
627 floating substrings if needed. */
628
4327152a 629STATIC void
304ee84b 630S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 631{
e1ec3a88
AL
632 const STRLEN l = CHR_SVLEN(data->last_found);
633 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 634 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 635
7918f24d
NC
636 PERL_ARGS_ASSERT_SCAN_COMMIT;
637
c277df42 638 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 639 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
640 if (*data->longest == data->longest_fixed) {
641 data->offset_fixed = l ? data->last_start_min : data->pos_min;
642 if (data->flags & SF_BEFORE_EOL)
b81d288d 643 data->flags
c277df42
IZ
644 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
645 else
646 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
647 data->minlen_fixed=minlenp;
648 data->lookbehind_fixed=0;
a0ed51b3 649 }
304ee84b 650 else { /* *data->longest == data->longest_float */
c277df42 651 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
652 data->offset_float_max = (l
653 ? data->last_start_max
c277df42 654 : data->pos_min + data->pos_delta);
304ee84b 655 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 656 data->offset_float_max = I32_MAX;
c277df42 657 if (data->flags & SF_BEFORE_EOL)
b81d288d 658 data->flags
c277df42
IZ
659 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
660 else
661 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
662 data->minlen_float=minlenp;
663 data->lookbehind_float=0;
c277df42
IZ
664 }
665 }
666 SvCUR_set(data->last_found, 0);
0eda9292 667 {
a28509cc 668 SV * const sv = data->last_found;
097eb12c
AL
669 if (SvUTF8(sv) && SvMAGICAL(sv)) {
670 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
671 if (mg)
672 mg->mg_len = 0;
673 }
0eda9292 674 }
c277df42
IZ
675 data->last_end = -1;
676 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 677 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
678}
679
653099ff
GS
680/* Can match anything (initialization) */
681STATIC void
097eb12c 682S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 683{
7918f24d
NC
684 PERL_ARGS_ASSERT_CL_ANYTHING;
685
653099ff 686 ANYOF_CLASS_ZERO(cl);
f8bef550 687 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 688 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
689 if (LOC)
690 cl->flags |= ANYOF_LOCALE;
691}
692
693/* Can match anything (initialization) */
694STATIC int
5f66b61c 695S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
696{
697 int value;
698
7918f24d
NC
699 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
700
aaa51d5e 701 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
702 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
703 return 1;
1aa99e6b
IH
704 if (!(cl->flags & ANYOF_UNICODE_ALL))
705 return 0;
10edeb5d 706 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 707 return 0;
653099ff
GS
708 return 1;
709}
710
711/* Can match anything (initialization) */
712STATIC void
097eb12c 713S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 714{
7918f24d
NC
715 PERL_ARGS_ASSERT_CL_INIT;
716
8ecf7187 717 Zero(cl, 1, struct regnode_charclass_class);
653099ff 718 cl->type = ANYOF;
830247a4 719 cl_anything(pRExC_state, cl);
653099ff
GS
720}
721
722STATIC void
097eb12c 723S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 724{
7918f24d
NC
725 PERL_ARGS_ASSERT_CL_INIT_ZERO;
726
8ecf7187 727 Zero(cl, 1, struct regnode_charclass_class);
653099ff 728 cl->type = ANYOF;
830247a4 729 cl_anything(pRExC_state, cl);
653099ff
GS
730 if (LOC)
731 cl->flags |= ANYOF_LOCALE;
732}
733
734/* 'And' a given class with another one. Can create false positives */
735/* We assume that cl is not inverted */
736STATIC void
5f66b61c 737S_cl_and(struct regnode_charclass_class *cl,
a28509cc 738 const struct regnode_charclass_class *and_with)
653099ff 739{
7918f24d 740 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
741
742 assert(and_with->type == ANYOF);
653099ff
GS
743 if (!(and_with->flags & ANYOF_CLASS)
744 && !(cl->flags & ANYOF_CLASS)
745 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
746 && !(and_with->flags & ANYOF_FOLD)
747 && !(cl->flags & ANYOF_FOLD)) {
748 int i;
749
750 if (and_with->flags & ANYOF_INVERT)
751 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
752 cl->bitmap[i] &= ~and_with->bitmap[i];
753 else
754 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
755 cl->bitmap[i] &= and_with->bitmap[i];
756 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
757 if (!(and_with->flags & ANYOF_EOS))
758 cl->flags &= ~ANYOF_EOS;
1aa99e6b 759
14ebb1a2
JH
760 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
761 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
762 cl->flags &= ~ANYOF_UNICODE_ALL;
763 cl->flags |= ANYOF_UNICODE;
764 ARG_SET(cl, ARG(and_with));
765 }
14ebb1a2
JH
766 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
767 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 768 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
769 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
770 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 771 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
772}
773
774/* 'OR' a given class with another one. Can create false positives */
775/* We assume that cl is not inverted */
776STATIC void
097eb12c 777S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 778{
7918f24d
NC
779 PERL_ARGS_ASSERT_CL_OR;
780
653099ff
GS
781 if (or_with->flags & ANYOF_INVERT) {
782 /* We do not use
783 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
784 * <= (B1 | !B2) | (CL1 | !CL2)
785 * which is wasteful if CL2 is small, but we ignore CL2:
786 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
787 * XXXX Can we handle case-fold? Unclear:
788 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
789 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
790 */
791 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
792 && !(or_with->flags & ANYOF_FOLD)
793 && !(cl->flags & ANYOF_FOLD) ) {
794 int i;
795
796 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
797 cl->bitmap[i] |= ~or_with->bitmap[i];
798 } /* XXXX: logic is complicated otherwise */
799 else {
830247a4 800 cl_anything(pRExC_state, cl);
653099ff
GS
801 }
802 } else {
803 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
804 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 805 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
806 || (cl->flags & ANYOF_FOLD)) ) {
807 int i;
808
809 /* OR char bitmap and class bitmap separately */
810 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
811 cl->bitmap[i] |= or_with->bitmap[i];
812 if (or_with->flags & ANYOF_CLASS) {
813 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
814 cl->classflags[i] |= or_with->classflags[i];
815 cl->flags |= ANYOF_CLASS;
816 }
817 }
818 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 819 cl_anything(pRExC_state, cl);
653099ff
GS
820 }
821 }
822 if (or_with->flags & ANYOF_EOS)
823 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
824
825 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
826 ARG(cl) != ARG(or_with)) {
827 cl->flags |= ANYOF_UNICODE_ALL;
828 cl->flags &= ~ANYOF_UNICODE;
829 }
830 if (or_with->flags & ANYOF_UNICODE_ALL) {
831 cl->flags |= ANYOF_UNICODE_ALL;
832 cl->flags &= ~ANYOF_UNICODE;
833 }
653099ff
GS
834}
835
a3621e74
YO
836#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
837#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
838#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
839#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
840
3dab1dad
YO
841
842#ifdef DEBUGGING
07be1b83 843/*
2b8b4781
NC
844 dump_trie(trie,widecharmap,revcharmap)
845 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
846 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
847
848 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
849 The _interim_ variants are used for debugging the interim
850 tables that are used to generate the final compressed
851 representation which is what dump_trie expects.
852
3dab1dad
YO
853 Part of the reason for their existance is to provide a form
854 of documentation as to how the different representations function.
07be1b83
YO
855
856*/
3dab1dad
YO
857
858/*
3dab1dad
YO
859 Dumps the final compressed table form of the trie to Perl_debug_log.
860 Used for debugging make_trie().
861*/
862
863STATIC void
2b8b4781
NC
864S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
865 AV *revcharmap, U32 depth)
3dab1dad
YO
866{
867 U32 state;
ab3bbdeb 868 SV *sv=sv_newmortal();
55eed653 869 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
870 GET_RE_DEBUG_FLAGS_DECL;
871
7918f24d 872 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 873
3dab1dad
YO
874 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
875 (int)depth * 2 + 2,"",
876 "Match","Base","Ofs" );
877
878 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 879 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 880 if ( tmp ) {
ab3bbdeb
YO
881 PerlIO_printf( Perl_debug_log, "%*s",
882 colwidth,
ddc5bc0f 883 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
884 PL_colors[0], PL_colors[1],
885 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
886 PERL_PV_ESCAPE_FIRSTCHAR
887 )
888 );
3dab1dad
YO
889 }
890 }
891 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
892 (int)depth * 2 + 2,"");
893
894 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 895 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
896 PerlIO_printf( Perl_debug_log, "\n");
897
1e2e3d02 898 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 899 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
900
901 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
902
903 if ( trie->states[ state ].wordnum ) {
904 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
905 } else {
906 PerlIO_printf( Perl_debug_log, "%6s", "" );
907 }
908
909 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
910
911 if ( base ) {
912 U32 ofs = 0;
913
914 while( ( base + ofs < trie->uniquecharcount ) ||
915 ( base + ofs - trie->uniquecharcount < trie->lasttrans
916 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
917 ofs++;
918
919 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
920
921 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
922 if ( ( base + ofs >= trie->uniquecharcount ) &&
923 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
924 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
925 {
ab3bbdeb
YO
926 PerlIO_printf( Perl_debug_log, "%*"UVXf,
927 colwidth,
3dab1dad
YO
928 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
929 } else {
ab3bbdeb 930 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
931 }
932 }
933
934 PerlIO_printf( Perl_debug_log, "]");
935
936 }
937 PerlIO_printf( Perl_debug_log, "\n" );
938 }
939}
940/*
3dab1dad
YO
941 Dumps a fully constructed but uncompressed trie in list form.
942 List tries normally only are used for construction when the number of
943 possible chars (trie->uniquecharcount) is very high.
944 Used for debugging make_trie().
945*/
946STATIC void
55eed653 947S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
948 HV *widecharmap, AV *revcharmap, U32 next_alloc,
949 U32 depth)
3dab1dad
YO
950{
951 U32 state;
ab3bbdeb 952 SV *sv=sv_newmortal();
55eed653 953 int colwidth= widecharmap ? 6 : 4;
3dab1dad 954 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
955
956 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
957
3dab1dad 958 /* print out the table precompression. */
ab3bbdeb
YO
959 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
960 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
961 "------:-----+-----------------\n" );
3dab1dad
YO
962
963 for( state=1 ; state < next_alloc ; state ++ ) {
964 U16 charid;
965
ab3bbdeb 966 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
967 (int)depth * 2 + 2,"", (UV)state );
968 if ( ! trie->states[ state ].wordnum ) {
969 PerlIO_printf( Perl_debug_log, "%5s| ","");
970 } else {
971 PerlIO_printf( Perl_debug_log, "W%4x| ",
972 trie->states[ state ].wordnum
973 );
974 }
975 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 976 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
977 if ( tmp ) {
978 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
979 colwidth,
ddc5bc0f 980 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
981 PL_colors[0], PL_colors[1],
982 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
983 PERL_PV_ESCAPE_FIRSTCHAR
984 ) ,
1e2e3d02
YO
985 TRIE_LIST_ITEM(state,charid).forid,
986 (UV)TRIE_LIST_ITEM(state,charid).newstate
987 );
988 if (!(charid % 10))
664e119d
RGS
989 PerlIO_printf(Perl_debug_log, "\n%*s| ",
990 (int)((depth * 2) + 14), "");
1e2e3d02 991 }
ab3bbdeb
YO
992 }
993 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
994 }
995}
996
997/*
3dab1dad
YO
998 Dumps a fully constructed but uncompressed trie in table form.
999 This is the normal DFA style state transition table, with a few
1000 twists to facilitate compression later.
1001 Used for debugging make_trie().
1002*/
1003STATIC void
55eed653 1004S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1005 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1006 U32 depth)
3dab1dad
YO
1007{
1008 U32 state;
1009 U16 charid;
ab3bbdeb 1010 SV *sv=sv_newmortal();
55eed653 1011 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1012 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1013
1014 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1015
1016 /*
1017 print out the table precompression so that we can do a visual check
1018 that they are identical.
1019 */
1020
1021 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1022
1023 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1024 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1025 if ( tmp ) {
ab3bbdeb
YO
1026 PerlIO_printf( Perl_debug_log, "%*s",
1027 colwidth,
ddc5bc0f 1028 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1029 PL_colors[0], PL_colors[1],
1030 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031 PERL_PV_ESCAPE_FIRSTCHAR
1032 )
1033 );
3dab1dad
YO
1034 }
1035 }
1036
1037 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1038
1039 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1040 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1041 }
1042
1043 PerlIO_printf( Perl_debug_log, "\n" );
1044
1045 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1046
1047 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1048 (int)depth * 2 + 2,"",
1049 (UV)TRIE_NODENUM( state ) );
1050
1051 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1052 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1053 if (v)
1054 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1055 else
1056 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1057 }
1058 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1059 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1060 } else {
1061 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1062 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1063 }
1064 }
07be1b83 1065}
3dab1dad
YO
1066
1067#endif
1068
786e8c11
YO
1069/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1070 startbranch: the first branch in the whole branch sequence
1071 first : start branch of sequence of branch-exact nodes.
1072 May be the same as startbranch
1073 last : Thing following the last branch.
1074 May be the same as tail.
1075 tail : item following the branch sequence
1076 count : words in the sequence
1077 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1078 depth : indent depth
3dab1dad 1079
786e8c11 1080Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1081
786e8c11
YO
1082A trie is an N'ary tree where the branches are determined by digital
1083decomposition of the key. IE, at the root node you look up the 1st character and
1084follow that branch repeat until you find the end of the branches. Nodes can be
1085marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1086
786e8c11 1087 /he|she|his|hers/
72f13be8 1088
786e8c11
YO
1089would convert into the following structure. Numbers represent states, letters
1090following numbers represent valid transitions on the letter from that state, if
1091the number is in square brackets it represents an accepting state, otherwise it
1092will be in parenthesis.
07be1b83 1093
786e8c11
YO
1094 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1095 | |
1096 | (2)
1097 | |
1098 (1) +-i->(6)-+-s->[7]
1099 |
1100 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1101
786e8c11
YO
1102 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1103
1104This shows that when matching against the string 'hers' we will begin at state 1
1105read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1106then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1107is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1108single traverse. We store a mapping from accepting to state to which word was
1109matched, and then when we have multiple possibilities we try to complete the
1110rest of the regex in the order in which they occured in the alternation.
1111
1112The only prior NFA like behaviour that would be changed by the TRIE support is
1113the silent ignoring of duplicate alternations which are of the form:
1114
1115 / (DUPE|DUPE) X? (?{ ... }) Y /x
1116
1117Thus EVAL blocks follwing a trie may be called a different number of times with
1118and without the optimisation. With the optimisations dupes will be silently
1119ignored. This inconsistant behaviour of EVAL type nodes is well established as
1120the following demonstrates:
1121
1122 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1123
1124which prints out 'word' three times, but
1125
1126 'words'=~/(word|word|word)(?{ print $1 })S/
1127
1128which doesnt print it out at all. This is due to other optimisations kicking in.
1129
1130Example of what happens on a structural level:
1131
1132The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1133
1134 1: CURLYM[1] {1,32767}(18)
1135 5: BRANCH(8)
1136 6: EXACT <ac>(16)
1137 8: BRANCH(11)
1138 9: EXACT <ad>(16)
1139 11: BRANCH(14)
1140 12: EXACT <ab>(16)
1141 16: SUCCEED(0)
1142 17: NOTHING(18)
1143 18: END(0)
1144
1145This would be optimizable with startbranch=5, first=5, last=16, tail=16
1146and should turn into:
1147
1148 1: CURLYM[1] {1,32767}(18)
1149 5: TRIE(16)
1150 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1151 <ac>
1152 <ad>
1153 <ab>
1154 16: SUCCEED(0)
1155 17: NOTHING(18)
1156 18: END(0)
1157
1158Cases where tail != last would be like /(?foo|bar)baz/:
1159
1160 1: BRANCH(4)
1161 2: EXACT <foo>(8)
1162 4: BRANCH(7)
1163 5: EXACT <bar>(8)
1164 7: TAIL(8)
1165 8: EXACT <baz>(10)
1166 10: END(0)
1167
1168which would be optimizable with startbranch=1, first=1, last=7, tail=8
1169and would end up looking like:
1170
1171 1: TRIE(8)
1172 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1173 <foo>
1174 <bar>
1175 7: TAIL(8)
1176 8: EXACT <baz>(10)
1177 10: END(0)
1178
1179 d = uvuni_to_utf8_flags(d, uv, 0);
1180
1181is the recommended Unicode-aware way of saying
1182
1183 *(d++) = uv;
1184*/
1185
1e2e3d02 1186#define TRIE_STORE_REVCHAR \
786e8c11 1187 STMT_START { \
73031816
NC
1188 if (UTF) { \
1189 SV *zlopp = newSV(2); \
88c9ea1e
CB
1190 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1191 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1192 SvCUR_set(zlopp, kapow - flrbbbbb); \
1193 SvPOK_on(zlopp); \
1194 SvUTF8_on(zlopp); \
1195 av_push(revcharmap, zlopp); \
1196 } else { \
6bdeddd2 1197 char ooooff = (char)uvc; \
73031816
NC
1198 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1199 } \
1200 } STMT_END
786e8c11
YO
1201
1202#define TRIE_READ_CHAR STMT_START { \
1203 wordlen++; \
1204 if ( UTF ) { \
1205 if ( folder ) { \
1206 if ( foldlen > 0 ) { \
1207 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1208 foldlen -= len; \
1209 scan += len; \
1210 len = 0; \
1211 } else { \
1212 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1213 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1214 foldlen -= UNISKIP( uvc ); \
1215 scan = foldbuf + UNISKIP( uvc ); \
1216 } \
1217 } else { \
1218 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1219 } \
1220 } else { \
1221 uvc = (U32)*uc; \
1222 len = 1; \
1223 } \
1224} STMT_END
1225
1226
1227
1228#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1229 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1230 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1231 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1232 } \
1233 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1234 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1235 TRIE_LIST_CUR( state )++; \
1236} STMT_END
07be1b83 1237
786e8c11
YO
1238#define TRIE_LIST_NEW(state) STMT_START { \
1239 Newxz( trie->states[ state ].trans.list, \
1240 4, reg_trie_trans_le ); \
1241 TRIE_LIST_CUR( state ) = 1; \
1242 TRIE_LIST_LEN( state ) = 4; \
1243} STMT_END
07be1b83 1244
786e8c11
YO
1245#define TRIE_HANDLE_WORD(state) STMT_START { \
1246 U16 dupe= trie->states[ state ].wordnum; \
1247 regnode * const noper_next = regnext( noper ); \
1248 \
1249 if (trie->wordlen) \
1250 trie->wordlen[ curword ] = wordlen; \
1251 DEBUG_r({ \
1252 /* store the word for dumping */ \
1253 SV* tmp; \
1254 if (OP(noper) != NOTHING) \
740cce10 1255 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1256 else \
740cce10 1257 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1258 av_push( trie_words, tmp ); \
786e8c11
YO
1259 }); \
1260 \
1261 curword++; \
1262 \
1263 if ( noper_next < tail ) { \
1264 if (!trie->jump) \
c944940b 1265 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1266 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1267 if (!jumper) \
1268 jumper = noper_next; \
1269 if (!nextbranch) \
1270 nextbranch= regnext(cur); \
1271 } \
1272 \
1273 if ( dupe ) { \
1274 /* So it's a dupe. This means we need to maintain a */\
1275 /* linked-list from the first to the next. */\
1276 /* we only allocate the nextword buffer when there */\
1277 /* a dupe, so first time we have to do the allocation */\
1278 if (!trie->nextword) \
c944940b 1279 trie->nextword = (U16 *) \
446bd890 1280 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1281 while ( trie->nextword[dupe] ) \
1282 dupe= trie->nextword[dupe]; \
1283 trie->nextword[dupe]= curword; \
1284 } else { \
1285 /* we haven't inserted this word yet. */ \
1286 trie->states[ state ].wordnum = curword; \
1287 } \
1288} STMT_END
07be1b83 1289
3dab1dad 1290
786e8c11
YO
1291#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1292 ( ( base + charid >= ucharcount \
1293 && base + charid < ubound \
1294 && state == trie->trans[ base - ucharcount + charid ].check \
1295 && trie->trans[ base - ucharcount + charid ].next ) \
1296 ? trie->trans[ base - ucharcount + charid ].next \
1297 : ( state==1 ? special : 0 ) \
1298 )
3dab1dad 1299
786e8c11
YO
1300#define MADE_TRIE 1
1301#define MADE_JUMP_TRIE 2
1302#define MADE_EXACT_TRIE 4
3dab1dad 1303
a3621e74 1304STATIC I32
786e8c11 1305S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1306{
27da23d5 1307 dVAR;
a3621e74
YO
1308 /* first pass, loop through and scan words */
1309 reg_trie_data *trie;
55eed653 1310 HV *widecharmap = NULL;
2b8b4781 1311 AV *revcharmap = newAV();
a3621e74 1312 regnode *cur;
9f7f3913 1313 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1314 STRLEN len = 0;
1315 UV uvc = 0;
1316 U16 curword = 0;
1317 U32 next_alloc = 0;
786e8c11
YO
1318 regnode *jumper = NULL;
1319 regnode *nextbranch = NULL;
7f69552c 1320 regnode *convert = NULL;
a3621e74 1321 /* we just use folder as a flag in utf8 */
e1ec3a88 1322 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1323 ? PL_fold
1324 : ( flags == EXACTFL
1325 ? PL_fold_locale
1326 : NULL
1327 )
1328 );
1329
2b8b4781
NC
1330#ifdef DEBUGGING
1331 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1332 AV *trie_words = NULL;
1333 /* along with revcharmap, this only used during construction but both are
1334 * useful during debugging so we store them in the struct when debugging.
8e11feef 1335 */
2b8b4781
NC
1336#else
1337 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1338 STRLEN trie_charcount=0;
3dab1dad 1339#endif
2b8b4781 1340 SV *re_trie_maxbuff;
a3621e74 1341 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1342
1343 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1344#ifndef DEBUGGING
1345 PERL_UNUSED_ARG(depth);
1346#endif
a3621e74 1347
c944940b 1348 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1349 trie->refcount = 1;
3dab1dad 1350 trie->startstate = 1;
786e8c11 1351 trie->wordcount = word_count;
f8fc2ecf 1352 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1353 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1354 if (!(UTF && folder))
c944940b 1355 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1356 DEBUG_r({
2b8b4781 1357 trie_words = newAV();
a3621e74 1358 });
a3621e74 1359
0111c4fd 1360 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1361 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1362 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1363 }
3dab1dad
YO
1364 DEBUG_OPTIMISE_r({
1365 PerlIO_printf( Perl_debug_log,
786e8c11 1366 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1367 (int)depth * 2 + 2, "",
1368 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1369 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1370 (int)depth);
3dab1dad 1371 });
7f69552c
YO
1372
1373 /* Find the node we are going to overwrite */
1374 if ( first == startbranch && OP( last ) != BRANCH ) {
1375 /* whole branch chain */
1376 convert = first;
1377 } else {
1378 /* branch sub-chain */
1379 convert = NEXTOPER( first );
1380 }
1381
a3621e74
YO
1382 /* -- First loop and Setup --
1383
1384 We first traverse the branches and scan each word to determine if it
1385 contains widechars, and how many unique chars there are, this is
1386 important as we have to build a table with at least as many columns as we
1387 have unique chars.
1388
1389 We use an array of integers to represent the character codes 0..255
38a44b82 1390 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1391 native representation of the character value as the key and IV's for the
1392 coded index.
1393
1394 *TODO* If we keep track of how many times each character is used we can
1395 remap the columns so that the table compression later on is more
1396 efficient in terms of memory by ensuring most common value is in the
1397 middle and the least common are on the outside. IMO this would be better
1398 than a most to least common mapping as theres a decent chance the most
1399 common letter will share a node with the least common, meaning the node
1400 will not be compressable. With a middle is most common approach the worst
1401 case is when we have the least common nodes twice.
1402
1403 */
1404
a3621e74 1405 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1406 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1407 const U8 *uc = (U8*)STRING( noper );
a28509cc 1408 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1409 STRLEN foldlen = 0;
1410 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1411 const U8 *scan = (U8*)NULL;
07be1b83 1412 U32 wordlen = 0; /* required init */
02daf0ab
YO
1413 STRLEN chars = 0;
1414 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1415
3dab1dad
YO
1416 if (OP(noper) == NOTHING) {
1417 trie->minlen= 0;
1418 continue;
1419 }
02daf0ab
YO
1420 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1421 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1422 regardless of encoding */
1423
a3621e74 1424 for ( ; uc < e ; uc += len ) {
3dab1dad 1425 TRIE_CHARCOUNT(trie)++;
a3621e74 1426 TRIE_READ_CHAR;
3dab1dad 1427 chars++;
a3621e74
YO
1428 if ( uvc < 256 ) {
1429 if ( !trie->charmap[ uvc ] ) {
1430 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1431 if ( folder )
1432 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1433 TRIE_STORE_REVCHAR;
a3621e74 1434 }
02daf0ab
YO
1435 if ( set_bit ) {
1436 /* store the codepoint in the bitmap, and if its ascii
1437 also store its folded equivelent. */
1438 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1439
1440 /* store the folded codepoint */
1441 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1442
1443 if ( !UTF ) {
1444 /* store first byte of utf8 representation of
1445 codepoints in the 127 < uvc < 256 range */
1446 if (127 < uvc && uvc < 192) {
1447 TRIE_BITMAP_SET(trie,194);
1448 } else if (191 < uvc ) {
1449 TRIE_BITMAP_SET(trie,195);
1450 /* && uvc < 256 -- we know uvc is < 256 already */
1451 }
1452 }
02daf0ab
YO
1453 set_bit = 0; /* We've done our bit :-) */
1454 }
a3621e74
YO
1455 } else {
1456 SV** svpp;
55eed653
NC
1457 if ( !widecharmap )
1458 widecharmap = newHV();
a3621e74 1459
55eed653 1460 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1461
1462 if ( !svpp )
e4584336 1463 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1464
1465 if ( !SvTRUE( *svpp ) ) {
1466 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1467 TRIE_STORE_REVCHAR;
a3621e74
YO
1468 }
1469 }
1470 }
3dab1dad
YO
1471 if( cur == first ) {
1472 trie->minlen=chars;
1473 trie->maxlen=chars;
1474 } else if (chars < trie->minlen) {
1475 trie->minlen=chars;
1476 } else if (chars > trie->maxlen) {
1477 trie->maxlen=chars;
1478 }
1479
a3621e74
YO
1480 } /* end first pass */
1481 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1482 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1483 (int)depth * 2 + 2,"",
55eed653 1484 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1485 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1486 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1487 );
c944940b 1488 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1489
1490 /*
1491 We now know what we are dealing with in terms of unique chars and
1492 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1493 representation using a flat table will take. If it's over a reasonable
1494 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1495 conservative but potentially much slower representation using an array
1496 of lists.
1497
1498 At the end we convert both representations into the same compressed
1499 form that will be used in regexec.c for matching with. The latter
1500 is a form that cannot be used to construct with but has memory
1501 properties similar to the list form and access properties similar
1502 to the table form making it both suitable for fast searches and
1503 small enough that its feasable to store for the duration of a program.
1504
1505 See the comment in the code where the compressed table is produced
1506 inplace from the flat tabe representation for an explanation of how
1507 the compression works.
1508
1509 */
1510
1511
3dab1dad 1512 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1513 /*
1514 Second Pass -- Array Of Lists Representation
1515
1516 Each state will be represented by a list of charid:state records
1517 (reg_trie_trans_le) the first such element holds the CUR and LEN
1518 points of the allocated array. (See defines above).
1519
1520 We build the initial structure using the lists, and then convert
1521 it into the compressed table form which allows faster lookups
1522 (but cant be modified once converted).
a3621e74
YO
1523 */
1524
a3621e74
YO
1525 STRLEN transcount = 1;
1526
1e2e3d02
YO
1527 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1528 "%*sCompiling trie using list compiler\n",
1529 (int)depth * 2 + 2, ""));
446bd890 1530
c944940b
JH
1531 trie->states = (reg_trie_state *)
1532 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1533 sizeof(reg_trie_state) );
a3621e74
YO
1534 TRIE_LIST_NEW(1);
1535 next_alloc = 2;
1536
1537 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1538
c445ea15
AL
1539 regnode * const noper = NEXTOPER( cur );
1540 U8 *uc = (U8*)STRING( noper );
1541 const U8 * const e = uc + STR_LEN( noper );
1542 U32 state = 1; /* required init */
1543 U16 charid = 0; /* sanity init */
1544 U8 *scan = (U8*)NULL; /* sanity init */
1545 STRLEN foldlen = 0; /* required init */
07be1b83 1546 U32 wordlen = 0; /* required init */
c445ea15
AL
1547 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1548
3dab1dad 1549 if (OP(noper) != NOTHING) {
786e8c11 1550 for ( ; uc < e ; uc += len ) {
c445ea15 1551
786e8c11 1552 TRIE_READ_CHAR;
c445ea15 1553
786e8c11
YO
1554 if ( uvc < 256 ) {
1555 charid = trie->charmap[ uvc ];
c445ea15 1556 } else {
55eed653 1557 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1558 if ( !svpp ) {
1559 charid = 0;
1560 } else {
1561 charid=(U16)SvIV( *svpp );
1562 }
c445ea15 1563 }
786e8c11
YO
1564 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1565 if ( charid ) {
a3621e74 1566
786e8c11
YO
1567 U16 check;
1568 U32 newstate = 0;
a3621e74 1569
786e8c11
YO
1570 charid--;
1571 if ( !trie->states[ state ].trans.list ) {
1572 TRIE_LIST_NEW( state );
c445ea15 1573 }
786e8c11
YO
1574 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1575 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1576 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1577 break;
1578 }
1579 }
1580 if ( ! newstate ) {
1581 newstate = next_alloc++;
1582 TRIE_LIST_PUSH( state, charid, newstate );
1583 transcount++;
1584 }
1585 state = newstate;
1586 } else {
1587 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1588 }
a28509cc 1589 }
c445ea15 1590 }
3dab1dad 1591 TRIE_HANDLE_WORD(state);
a3621e74
YO
1592
1593 } /* end second pass */
1594
1e2e3d02
YO
1595 /* next alloc is the NEXT state to be allocated */
1596 trie->statecount = next_alloc;
c944940b
JH
1597 trie->states = (reg_trie_state *)
1598 PerlMemShared_realloc( trie->states,
1599 next_alloc
1600 * sizeof(reg_trie_state) );
a3621e74 1601
3dab1dad 1602 /* and now dump it out before we compress it */
2b8b4781
NC
1603 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1604 revcharmap, next_alloc,
1605 depth+1)
1e2e3d02 1606 );
a3621e74 1607
c944940b
JH
1608 trie->trans = (reg_trie_trans *)
1609 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1610 {
1611 U32 state;
a3621e74
YO
1612 U32 tp = 0;
1613 U32 zp = 0;
1614
1615
1616 for( state=1 ; state < next_alloc ; state ++ ) {
1617 U32 base=0;
1618
1619 /*
1620 DEBUG_TRIE_COMPILE_MORE_r(
1621 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1622 );
1623 */
1624
1625 if (trie->states[state].trans.list) {
1626 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1627 U16 maxid=minid;
a28509cc 1628 U16 idx;
a3621e74
YO
1629
1630 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1631 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1632 if ( forid < minid ) {
1633 minid=forid;
1634 } else if ( forid > maxid ) {
1635 maxid=forid;
1636 }
a3621e74
YO
1637 }
1638 if ( transcount < tp + maxid - minid + 1) {
1639 transcount *= 2;
c944940b
JH
1640 trie->trans = (reg_trie_trans *)
1641 PerlMemShared_realloc( trie->trans,
446bd890
NC
1642 transcount
1643 * sizeof(reg_trie_trans) );
a3621e74
YO
1644 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1645 }
1646 base = trie->uniquecharcount + tp - minid;
1647 if ( maxid == minid ) {
1648 U32 set = 0;
1649 for ( ; zp < tp ; zp++ ) {
1650 if ( ! trie->trans[ zp ].next ) {
1651 base = trie->uniquecharcount + zp - minid;
1652 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1653 trie->trans[ zp ].check = state;
1654 set = 1;
1655 break;
1656 }
1657 }
1658 if ( !set ) {
1659 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1660 trie->trans[ tp ].check = state;
1661 tp++;
1662 zp = tp;
1663 }
1664 } else {
1665 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1666 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1667 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1668 trie->trans[ tid ].check = state;
1669 }
1670 tp += ( maxid - minid + 1 );
1671 }
1672 Safefree(trie->states[ state ].trans.list);
1673 }
1674 /*
1675 DEBUG_TRIE_COMPILE_MORE_r(
1676 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1677 );
1678 */
1679 trie->states[ state ].trans.base=base;
1680 }
cc601c31 1681 trie->lasttrans = tp + 1;
a3621e74
YO
1682 }
1683 } else {
1684 /*
1685 Second Pass -- Flat Table Representation.
1686
1687 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1688 We know that we will need Charcount+1 trans at most to store the data
1689 (one row per char at worst case) So we preallocate both structures
1690 assuming worst case.
1691
1692 We then construct the trie using only the .next slots of the entry
1693 structs.
1694
1695 We use the .check field of the first entry of the node temporarily to
1696 make compression both faster and easier by keeping track of how many non
1697 zero fields are in the node.
1698
1699 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1700 transition.
1701
1702 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1703 number representing the first entry of the node, and state as a
1704 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1705 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1706 are 2 entrys per node. eg:
1707
1708 A B A B
1709 1. 2 4 1. 3 7
1710 2. 0 3 3. 0 5
1711 3. 0 0 5. 0 0
1712 4. 0 0 7. 0 0
1713
1714 The table is internally in the right hand, idx form. However as we also
1715 have to deal with the states array which is indexed by nodenum we have to
1716 use TRIE_NODENUM() to convert.
1717
1718 */
1e2e3d02
YO
1719 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1720 "%*sCompiling trie using table compiler\n",
1721 (int)depth * 2 + 2, ""));
3dab1dad 1722
c944940b
JH
1723 trie->trans = (reg_trie_trans *)
1724 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1725 * trie->uniquecharcount + 1,
1726 sizeof(reg_trie_trans) );
1727 trie->states = (reg_trie_state *)
1728 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1729 sizeof(reg_trie_state) );
a3621e74
YO
1730 next_alloc = trie->uniquecharcount + 1;
1731
3dab1dad 1732
a3621e74
YO
1733 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1734
c445ea15 1735 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1736 const U8 *uc = (U8*)STRING( noper );
1737 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1738
1739 U32 state = 1; /* required init */
1740
1741 U16 charid = 0; /* sanity init */
1742 U32 accept_state = 0; /* sanity init */
1743 U8 *scan = (U8*)NULL; /* sanity init */
1744
1745 STRLEN foldlen = 0; /* required init */
07be1b83 1746 U32 wordlen = 0; /* required init */
a3621e74
YO
1747 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1748
3dab1dad 1749 if ( OP(noper) != NOTHING ) {
786e8c11 1750 for ( ; uc < e ; uc += len ) {
a3621e74 1751
786e8c11 1752 TRIE_READ_CHAR;
a3621e74 1753
786e8c11
YO
1754 if ( uvc < 256 ) {
1755 charid = trie->charmap[ uvc ];
1756 } else {
55eed653 1757 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1758 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1759 }
786e8c11
YO
1760 if ( charid ) {
1761 charid--;
1762 if ( !trie->trans[ state + charid ].next ) {
1763 trie->trans[ state + charid ].next = next_alloc;
1764 trie->trans[ state ].check++;
1765 next_alloc += trie->uniquecharcount;
1766 }
1767 state = trie->trans[ state + charid ].next;
1768 } else {
1769 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1770 }
1771 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1772 }
a3621e74 1773 }
3dab1dad
YO
1774 accept_state = TRIE_NODENUM( state );
1775 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1776
1777 } /* end second pass */
1778
3dab1dad 1779 /* and now dump it out before we compress it */
2b8b4781
NC
1780 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1781 revcharmap,
1782 next_alloc, depth+1));
a3621e74 1783
a3621e74
YO
1784 {
1785 /*
1786 * Inplace compress the table.*
1787
1788 For sparse data sets the table constructed by the trie algorithm will
1789 be mostly 0/FAIL transitions or to put it another way mostly empty.
1790 (Note that leaf nodes will not contain any transitions.)
1791
1792 This algorithm compresses the tables by eliminating most such
1793 transitions, at the cost of a modest bit of extra work during lookup:
1794
1795 - Each states[] entry contains a .base field which indicates the
1796 index in the state[] array wheres its transition data is stored.
1797
1798 - If .base is 0 there are no valid transitions from that node.
1799
1800 - If .base is nonzero then charid is added to it to find an entry in
1801 the trans array.
1802
1803 -If trans[states[state].base+charid].check!=state then the
1804 transition is taken to be a 0/Fail transition. Thus if there are fail
1805 transitions at the front of the node then the .base offset will point
1806 somewhere inside the previous nodes data (or maybe even into a node
1807 even earlier), but the .check field determines if the transition is
1808 valid.
1809
786e8c11 1810 XXX - wrong maybe?
a3621e74
YO
1811 The following process inplace converts the table to the compressed
1812 table: We first do not compress the root node 1,and mark its all its
1813 .check pointers as 1 and set its .base pointer as 1 as well. This
1814 allows to do a DFA construction from the compressed table later, and
1815 ensures that any .base pointers we calculate later are greater than
1816 0.
1817
1818 - We set 'pos' to indicate the first entry of the second node.
1819
1820 - We then iterate over the columns of the node, finding the first and
1821 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1822 and set the .check pointers accordingly, and advance pos
1823 appropriately and repreat for the next node. Note that when we copy
1824 the next pointers we have to convert them from the original
1825 NODEIDX form to NODENUM form as the former is not valid post
1826 compression.
1827
1828 - If a node has no transitions used we mark its base as 0 and do not
1829 advance the pos pointer.
1830
1831 - If a node only has one transition we use a second pointer into the
1832 structure to fill in allocated fail transitions from other states.
1833 This pointer is independent of the main pointer and scans forward
1834 looking for null transitions that are allocated to a state. When it
1835 finds one it writes the single transition into the "hole". If the
786e8c11 1836 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1837
1838 - Once compressed we can Renew/realloc the structures to release the
1839 excess space.
1840
1841 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1842 specifically Fig 3.47 and the associated pseudocode.
1843
1844 demq
1845 */
a3b680e6 1846 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1847 U32 state, charid;
a3621e74 1848 U32 pos = 0, zp=0;
1e2e3d02 1849 trie->statecount = laststate;
a3621e74
YO
1850
1851 for ( state = 1 ; state < laststate ; state++ ) {
1852 U8 flag = 0;
a28509cc
AL
1853 const U32 stateidx = TRIE_NODEIDX( state );
1854 const U32 o_used = trie->trans[ stateidx ].check;
1855 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1856 trie->trans[ stateidx ].check = 0;
1857
1858 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1859 if ( flag || trie->trans[ stateidx + charid ].next ) {
1860 if ( trie->trans[ stateidx + charid ].next ) {
1861 if (o_used == 1) {
1862 for ( ; zp < pos ; zp++ ) {
1863 if ( ! trie->trans[ zp ].next ) {
1864 break;
1865 }
1866 }
1867 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1868 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1869 trie->trans[ zp ].check = state;
1870 if ( ++zp > pos ) pos = zp;
1871 break;
1872 }
1873 used--;
1874 }
1875 if ( !flag ) {
1876 flag = 1;
1877 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1878 }
1879 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880 trie->trans[ pos ].check = state;
1881 pos++;
1882 }
1883 }
1884 }
cc601c31 1885 trie->lasttrans = pos + 1;
c944940b
JH
1886 trie->states = (reg_trie_state *)
1887 PerlMemShared_realloc( trie->states, laststate
1888 * sizeof(reg_trie_state) );
a3621e74 1889 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1890 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1891 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1892 (int)depth * 2 + 2,"",
1893 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1894 (IV)next_alloc,
1895 (IV)pos,
a3621e74
YO
1896 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1897 );
1898
1899 } /* end table compress */
1900 }
1e2e3d02
YO
1901 DEBUG_TRIE_COMPILE_MORE_r(
1902 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1903 (int)depth * 2 + 2, "",
1904 (UV)trie->statecount,
1905 (UV)trie->lasttrans)
1906 );
cc601c31 1907 /* resize the trans array to remove unused space */
c944940b
JH
1908 trie->trans = (reg_trie_trans *)
1909 PerlMemShared_realloc( trie->trans, trie->lasttrans
1910 * sizeof(reg_trie_trans) );
a3621e74 1911
3dab1dad 1912 /* and now dump out the compressed format */
2b8b4781 1913 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1914
3dab1dad 1915 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1916 U8 nodetype =(U8)(flags & 0xFF);
1917 char *str=NULL;
786e8c11 1918
07be1b83 1919#ifdef DEBUGGING
e62cc96a 1920 regnode *optimize = NULL;
7122b237
YO
1921#ifdef RE_TRACK_PATTERN_OFFSETS
1922
b57a0404
JH
1923 U32 mjd_offset = 0;
1924 U32 mjd_nodelen = 0;
7122b237
YO
1925#endif /* RE_TRACK_PATTERN_OFFSETS */
1926#endif /* DEBUGGING */
a3621e74 1927 /*
3dab1dad
YO
1928 This means we convert either the first branch or the first Exact,
1929 depending on whether the thing following (in 'last') is a branch
1930 or not and whther first is the startbranch (ie is it a sub part of
1931 the alternation or is it the whole thing.)
1932 Assuming its a sub part we conver the EXACT otherwise we convert
1933 the whole branch sequence, including the first.
a3621e74 1934 */
3dab1dad 1935 /* Find the node we are going to overwrite */
7f69552c 1936 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1937 /* branch sub-chain */
3dab1dad 1938 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1939#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1940 DEBUG_r({
1941 mjd_offset= Node_Offset((convert));
1942 mjd_nodelen= Node_Length((convert));
1943 });
7122b237 1944#endif
7f69552c 1945 /* whole branch chain */
7122b237
YO
1946 }
1947#ifdef RE_TRACK_PATTERN_OFFSETS
1948 else {
7f69552c
YO
1949 DEBUG_r({
1950 const regnode *nop = NEXTOPER( convert );
1951 mjd_offset= Node_Offset((nop));
1952 mjd_nodelen= Node_Length((nop));
1953 });
07be1b83
YO
1954 }
1955 DEBUG_OPTIMISE_r(
1956 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1957 (int)depth * 2 + 2, "",
786e8c11 1958 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1959 );
7122b237 1960#endif
3dab1dad
YO
1961 /* But first we check to see if there is a common prefix we can
1962 split out as an EXACT and put in front of the TRIE node. */
1963 trie->startstate= 1;
55eed653 1964 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1965 U32 state;
1e2e3d02 1966 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1967 U32 ofs = 0;
8e11feef
RGS
1968 I32 idx = -1;
1969 U32 count = 0;
1970 const U32 base = trie->states[ state ].trans.base;
a3621e74 1971
3dab1dad 1972 if ( trie->states[state].wordnum )
8e11feef 1973 count = 1;
a3621e74 1974
8e11feef 1975 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1976 if ( ( base + ofs >= trie->uniquecharcount ) &&
1977 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1978 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1979 {
3dab1dad 1980 if ( ++count > 1 ) {
2b8b4781 1981 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1982 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1983 if ( state == 1 ) break;
3dab1dad
YO
1984 if ( count == 2 ) {
1985 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1986 DEBUG_OPTIMISE_r(
8e11feef
RGS
1987 PerlIO_printf(Perl_debug_log,
1988 "%*sNew Start State=%"UVuf" Class: [",
1989 (int)depth * 2 + 2, "",
786e8c11 1990 (UV)state));
be8e71aa 1991 if (idx >= 0) {
2b8b4781 1992 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1993 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1994
3dab1dad 1995 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1996 if ( folder )
1997 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1998 DEBUG_OPTIMISE_r(
f1f66076 1999 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2000 );
8e11feef
RGS
2001 }
2002 }
2003 TRIE_BITMAP_SET(trie,*ch);
2004 if ( folder )
2005 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2006 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2007 }
2008 idx = ofs;
2009 }
3dab1dad
YO
2010 }
2011 if ( count == 1 ) {
2b8b4781 2012 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2013 STRLEN len;
2014 char *ch = SvPV( *tmp, len );
de734bd5
A
2015 DEBUG_OPTIMISE_r({
2016 SV *sv=sv_newmortal();
8e11feef
RGS
2017 PerlIO_printf( Perl_debug_log,
2018 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2019 (int)depth * 2 + 2, "",
de734bd5
A
2020 (UV)state, (UV)idx,
2021 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2022 PL_colors[0], PL_colors[1],
2023 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2024 PERL_PV_ESCAPE_FIRSTCHAR
2025 )
2026 );
2027 });
3dab1dad
YO
2028 if ( state==1 ) {
2029 OP( convert ) = nodetype;
2030 str=STRING(convert);
2031 STR_LEN(convert)=0;
2032 }
c490c714
YO
2033 STR_LEN(convert) += len;
2034 while (len--)
de734bd5 2035 *str++ = *ch++;
8e11feef 2036 } else {
f9049ba1 2037#ifdef DEBUGGING
8e11feef
RGS
2038 if (state>1)
2039 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2040#endif
8e11feef
RGS
2041 break;
2042 }
2043 }
3dab1dad 2044 if (str) {
8e11feef 2045 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2046 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2047 trie->startstate = state;
07be1b83
YO
2048 trie->minlen -= (state - 1);
2049 trie->maxlen -= (state - 1);
33809eae
JH
2050#ifdef DEBUGGING
2051 /* At least the UNICOS C compiler choked on this
2052 * being argument to DEBUG_r(), so let's just have
2053 * it right here. */
2054 if (
2055#ifdef PERL_EXT_RE_BUILD
2056 1
2057#else
2058 DEBUG_r_TEST
2059#endif
2060 ) {
2061 regnode *fix = convert;
2062 U32 word = trie->wordcount;
2063 mjd_nodelen++;
2064 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2065 while( ++fix < n ) {
2066 Set_Node_Offset_Length(fix, 0, 0);
2067 }
2068 while (word--) {
2069 SV ** const tmp = av_fetch( trie_words, word, 0 );
2070 if (tmp) {
2071 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2072 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2073 else
2074 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2075 }
2076 }
2077 }
2078#endif
8e11feef
RGS
2079 if (trie->maxlen) {
2080 convert = n;
2081 } else {
3dab1dad 2082 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2083 DEBUG_r(optimize= n);
3dab1dad
YO
2084 }
2085 }
2086 }
a5ca303d
YO
2087 if (!jumper)
2088 jumper = last;
3dab1dad 2089 if ( trie->maxlen ) {
8e11feef
RGS
2090 NEXT_OFF( convert ) = (U16)(tail - convert);
2091 ARG_SET( convert, data_slot );
786e8c11
YO
2092 /* Store the offset to the first unabsorbed branch in
2093 jump[0], which is otherwise unused by the jump logic.
2094 We use this when dumping a trie and during optimisation. */
2095 if (trie->jump)
7f69552c 2096 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2097
786e8c11
YO
2098 /* XXXX */
2099 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2100 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2101 {
2102 OP( convert ) = TRIEC;
2103 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2104 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2105 trie->bitmap= NULL;
2106 } else
2107 OP( convert ) = TRIE;
a3621e74 2108
3dab1dad
YO
2109 /* store the type in the flags */
2110 convert->flags = nodetype;
a5ca303d
YO
2111 DEBUG_r({
2112 optimize = convert
2113 + NODE_STEP_REGNODE
2114 + regarglen[ OP( convert ) ];
2115 });
2116 /* XXX We really should free up the resource in trie now,
2117 as we won't use them - (which resources?) dmq */
3dab1dad 2118 }
a3621e74 2119 /* needed for dumping*/
e62cc96a 2120 DEBUG_r(if (optimize) {
07be1b83 2121 regnode *opt = convert;
bcdf7404 2122
e62cc96a 2123 while ( ++opt < optimize) {
07be1b83
YO
2124 Set_Node_Offset_Length(opt,0,0);
2125 }
786e8c11
YO
2126 /*
2127 Try to clean up some of the debris left after the
2128 optimisation.
a3621e74 2129 */
786e8c11 2130 while( optimize < jumper ) {
07be1b83 2131 mjd_nodelen += Node_Length((optimize));
a3621e74 2132 OP( optimize ) = OPTIMIZED;
07be1b83 2133 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2134 optimize++;
2135 }
07be1b83 2136 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2137 });
2138 } /* end node insert */
55eed653 2139 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2140#ifdef DEBUGGING
2141 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2142 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2143#else
2144 SvREFCNT_dec(revcharmap);
07be1b83 2145#endif
786e8c11
YO
2146 return trie->jump
2147 ? MADE_JUMP_TRIE
2148 : trie->startstate>1
2149 ? MADE_EXACT_TRIE
2150 : MADE_TRIE;
2151}
2152
2153STATIC void
2154S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2155{
2156/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2157
2158 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2159 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2160 ISBN 0-201-10088-6
2161
2162 We find the fail state for each state in the trie, this state is the longest proper
2163 suffix of the current states 'word' that is also a proper prefix of another word in our
2164 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2165 the DFA not to have to restart after its tried and failed a word at a given point, it
2166 simply continues as though it had been matching the other word in the first place.
2167 Consider
2168 'abcdgu'=~/abcdefg|cdgu/
2169 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2170 fail, which would bring use to the state representing 'd' in the second word where we would
2171 try 'g' and succeed, prodceding to match 'cdgu'.
2172 */
2173 /* add a fail transition */
3251b653
NC
2174 const U32 trie_offset = ARG(source);
2175 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2176 U32 *q;
2177 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2178 const U32 numstates = trie->statecount;
786e8c11
YO
2179 const U32 ubound = trie->lasttrans + ucharcount;
2180 U32 q_read = 0;
2181 U32 q_write = 0;
2182 U32 charid;
2183 U32 base = trie->states[ 1 ].trans.base;
2184 U32 *fail;
2185 reg_ac_data *aho;
2186 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2187 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2188
2189 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2190#ifndef DEBUGGING
2191 PERL_UNUSED_ARG(depth);
2192#endif
2193
2194
2195 ARG_SET( stclass, data_slot );
c944940b 2196 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2197 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2198 aho->trie=trie_offset;
446bd890
NC
2199 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2200 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2201 Newxz( q, numstates, U32);
c944940b 2202 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2203 aho->refcount = 1;
2204 fail = aho->fail;
2205 /* initialize fail[0..1] to be 1 so that we always have
2206 a valid final fail state */
2207 fail[ 0 ] = fail[ 1 ] = 1;
2208
2209 for ( charid = 0; charid < ucharcount ; charid++ ) {
2210 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2211 if ( newstate ) {
2212 q[ q_write ] = newstate;
2213 /* set to point at the root */
2214 fail[ q[ q_write++ ] ]=1;
2215 }
2216 }
2217 while ( q_read < q_write) {
2218 const U32 cur = q[ q_read++ % numstates ];
2219 base = trie->states[ cur ].trans.base;
2220
2221 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2222 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2223 if (ch_state) {
2224 U32 fail_state = cur;
2225 U32 fail_base;
2226 do {
2227 fail_state = fail[ fail_state ];
2228 fail_base = aho->states[ fail_state ].trans.base;
2229 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2230
2231 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2232 fail[ ch_state ] = fail_state;
2233 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2234 {
2235 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2236 }
2237 q[ q_write++ % numstates] = ch_state;
2238 }
2239 }
2240 }
2241 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2242 when we fail in state 1, this allows us to use the
2243 charclass scan to find a valid start char. This is based on the principle
2244 that theres a good chance the string being searched contains lots of stuff
2245 that cant be a start char.
2246 */
2247 fail[ 0 ] = fail[ 1 ] = 0;
2248 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2249 PerlIO_printf(Perl_debug_log,
2250 "%*sStclass Failtable (%"UVuf" states): 0",
2251 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2252 );
786e8c11
YO
2253 for( q_read=1; q_read<numstates; q_read++ ) {
2254 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2255 }
2256 PerlIO_printf(Perl_debug_log, "\n");
2257 });
2258 Safefree(q);
2259 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2260}
2261
786e8c11 2262
a3621e74 2263/*
5d1c421c
JH
2264 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2265 * These need to be revisited when a newer toolchain becomes available.
2266 */
2267#if defined(__sparc64__) && defined(__GNUC__)
2268# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2269# undef SPARC64_GCC_WORKAROUND
2270# define SPARC64_GCC_WORKAROUND 1
2271# endif
2272#endif
2273
07be1b83 2274#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2275 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2276 SV * const mysv=sv_newmortal(); \
2277 regnode *Next = regnext(scan); \
2278 regprop(RExC_rx, mysv, scan); \
7f69552c 2279 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2280 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2281 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2282 }});
07be1b83 2283
1de06328
YO
2284
2285
2286
2287
07be1b83
YO
2288#define JOIN_EXACT(scan,min,flags) \
2289 if (PL_regkind[OP(scan)] == EXACT) \
2290 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2291
be8e71aa 2292STATIC U32
07be1b83
YO
2293S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2294 /* Merge several consecutive EXACTish nodes into one. */
2295 regnode *n = regnext(scan);
2296 U32 stringok = 1;
2297 regnode *next = scan + NODE_SZ_STR(scan);
2298 U32 merged = 0;
2299 U32 stopnow = 0;
2300#ifdef DEBUGGING
2301 regnode *stop = scan;
72f13be8 2302 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2303#else
d47053eb
RGS
2304 PERL_UNUSED_ARG(depth);
2305#endif
7918f24d
NC
2306
2307 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2308#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2309 PERL_UNUSED_ARG(flags);
2310 PERL_UNUSED_ARG(val);
07be1b83 2311#endif
07be1b83
YO
2312 DEBUG_PEEP("join",scan,depth);
2313
2314 /* Skip NOTHING, merge EXACT*. */
2315 while (n &&
2316 ( PL_regkind[OP(n)] == NOTHING ||
2317 (stringok && (OP(n) == OP(scan))))
2318 && NEXT_OFF(n)
2319 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2320
2321 if (OP(n) == TAIL || n > next)
2322 stringok = 0;
2323 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2324 DEBUG_PEEP("skip:",n,depth);
2325 NEXT_OFF(scan) += NEXT_OFF(n);
2326 next = n + NODE_STEP_REGNODE;
2327#ifdef DEBUGGING
2328 if (stringok)
2329 stop = n;
2330#endif
2331 n = regnext(n);
2332 }
2333 else if (stringok) {
786e8c11 2334 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2335 regnode * const nnext = regnext(n);
2336
2337 DEBUG_PEEP("merg",n,depth);
2338
2339 merged++;
2340 if (oldl + STR_LEN(n) > U8_MAX)
2341 break;
2342 NEXT_OFF(scan) += NEXT_OFF(n);
2343 STR_LEN(scan) += STR_LEN(n);
2344 next = n + NODE_SZ_STR(n);
2345 /* Now we can overwrite *n : */
2346 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2347#ifdef DEBUGGING
2348 stop = next - 1;
2349#endif
2350 n = nnext;
2351 if (stopnow) break;
2352 }
2353
d47053eb
RGS
2354#ifdef EXPERIMENTAL_INPLACESCAN
2355 if (flags && !NEXT_OFF(n)) {
2356 DEBUG_PEEP("atch", val, depth);
2357 if (reg_off_by_arg[OP(n)]) {
2358 ARG_SET(n, val - n);
2359 }
2360 else {
2361 NEXT_OFF(n) = val - n;
2362 }
2363 stopnow = 1;
2364 }
07be1b83
YO
2365#endif
2366 }
2367
2368 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2369 /*
2370 Two problematic code points in Unicode casefolding of EXACT nodes:
2371
2372 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2373 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2374
2375 which casefold to
2376
2377 Unicode UTF-8
2378
2379 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2380 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2381
2382 This means that in case-insensitive matching (or "loose matching",
2383 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2384 length of the above casefolded versions) can match a target string
2385 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2386 This would rather mess up the minimum length computation.
2387
2388 What we'll do is to look for the tail four bytes, and then peek
2389 at the preceding two bytes to see whether we need to decrease
2390 the minimum length by four (six minus two).
2391
2392 Thanks to the design of UTF-8, there cannot be false matches:
2393 A sequence of valid UTF-8 bytes cannot be a subsequence of
2394 another valid sequence of UTF-8 bytes.
2395
2396 */
2397 char * const s0 = STRING(scan), *s, *t;
2398 char * const s1 = s0 + STR_LEN(scan) - 1;
2399 char * const s2 = s1 - 4;
e294cc5d
JH
2400#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2401 const char t0[] = "\xaf\x49\xaf\x42";
2402#else
07be1b83 2403 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2404#endif
07be1b83
YO
2405 const char * const t1 = t0 + 3;
2406
2407 for (s = s0 + 2;
2408 s < s2 && (t = ninstr(s, s1, t0, t1));
2409 s = t + 4) {
e294cc5d
JH
2410#ifdef EBCDIC
2411 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2412 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2413#else
07be1b83
YO
2414 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2415 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2416#endif
07be1b83
YO
2417 *min -= 4;
2418 }
2419 }
2420
2421#ifdef DEBUGGING
2422 /* Allow dumping */
2423 n = scan + NODE_SZ_STR(scan);
2424 while (n <= stop) {
2425 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2426 OP(n) = OPTIMIZED;
2427 NEXT_OFF(n) = 0;
2428 }
2429 n++;
2430 }
2431#endif
2432 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2433 return stopnow;
2434}
2435
653099ff
GS
2436/* REx optimizer. Converts nodes into quickier variants "in place".
2437 Finds fixed substrings. */
2438
a0288114 2439/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2440 to the position after last scanned or to NULL. */
2441
40d049e4
YO
2442#define INIT_AND_WITHP \
2443 assert(!and_withp); \
2444 Newx(and_withp,1,struct regnode_charclass_class); \
2445 SAVEFREEPV(and_withp)
07be1b83 2446
b515a41d
YO
2447/* this is a chain of data about sub patterns we are processing that
2448 need to be handled seperately/specially in study_chunk. Its so
2449 we can simulate recursion without losing state. */
2450struct scan_frame;
2451typedef struct scan_frame {
2452 regnode *last; /* last node to process in this frame */
2453 regnode *next; /* next node to process when last is reached */
2454 struct scan_frame *prev; /*previous frame*/
2455 I32 stop; /* what stopparen do we use */
2456} scan_frame;
2457
304ee84b
YO
2458
2459#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2460
e1d1eefb
YO
2461#define CASE_SYNST_FNC(nAmE) \
2462case nAmE: \
2463 if (flags & SCF_DO_STCLASS_AND) { \
2464 for (value = 0; value < 256; value++) \
2465 if (!is_ ## nAmE ## _cp(value)) \
2466 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2467 } \
2468 else { \
2469 for (value = 0; value < 256; value++) \
2470 if (is_ ## nAmE ## _cp(value)) \
2471 ANYOF_BITMAP_SET(data->start_class, value); \
2472 } \
2473 break; \
2474case N ## nAmE: \
2475 if (flags & SCF_DO_STCLASS_AND) { \
2476 for (value = 0; value < 256; value++) \
2477 if (is_ ## nAmE ## _cp(value)) \
2478 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2479 } \
2480 else { \
2481 for (value = 0; value < 256; value++) \
2482 if (!is_ ## nAmE ## _cp(value)) \
2483 ANYOF_BITMAP_SET(data->start_class, value); \
2484 } \
2485 break
2486
2487
2488
76e3520e 2489STATIC I32
40d049e4 2490S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2491 I32 *minlenp, I32 *deltap,
40d049e4
YO
2492 regnode *last,
2493 scan_data_t *data,
2494 I32 stopparen,
2495 U8* recursed,
2496 struct regnode_charclass_class *and_withp,
2497 U32 flags, U32 depth)
c277df42
IZ
2498 /* scanp: Start here (read-write). */
2499 /* deltap: Write maxlen-minlen here. */
2500 /* last: Stop before this one. */
40d049e4
YO
2501 /* data: string data about the pattern */
2502 /* stopparen: treat close N as END */
2503 /* recursed: which subroutines have we recursed into */
2504 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2505{
97aff369 2506 dVAR;
c277df42
IZ
2507 I32 min = 0, pars = 0, code;
2508 regnode *scan = *scanp, *next;
2509 I32 delta = 0;
2510 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2511 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2512 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2513 scan_data_t data_fake;
a3621e74 2514 SV *re_trie_maxbuff = NULL;
786e8c11 2515 regnode *first_non_open = scan;
e2e6a0f1 2516 I32 stopmin = I32_MAX;
8aa23a47 2517 scan_frame *frame = NULL;
a3621e74 2518 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2519
7918f24d
NC
2520 PERL_ARGS_ASSERT_STUDY_CHUNK;
2521
13a24bad 2522#ifdef DEBUGGING
40d049e4 2523 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2524#endif
40d049e4 2525
786e8c11 2526 if ( depth == 0 ) {
40d049e4 2527 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2528 first_non_open=regnext(first_non_open);
2529 }
2530
b81d288d 2531
8aa23a47
YO
2532 fake_study_recurse:
2533 while ( scan && OP(scan) != END && scan < last ){
2534 /* Peephole optimizer: */
304ee84b 2535 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2536 DEBUG_PEEP("Peep",scan,depth);
2537 JOIN_EXACT(scan,&min,0);
2538
2539 /* Follow the next-chain of the current node and optimize
2540 away all the NOTHINGs from it. */
2541 if (OP(scan) != CURLYX) {
2542 const int max = (reg_off_by_arg[OP(scan)]
2543 ? I32_MAX
2544 /* I32 may be smaller than U16 on CRAYs! */
2545 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2546 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2547 int noff;
2548 regnode *n = scan;
2549
2550 /* Skip NOTHING and LONGJMP. */
2551 while ((n = regnext(n))
2552 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2553 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2554 && off + noff < max)
2555 off += noff;
2556 if (reg_off_by_arg[OP(scan)])
2557 ARG(scan) = off;
2558 else
2559 NEXT_OFF(scan) = off;
2560 }
a3621e74 2561
c277df42 2562
8aa23a47
YO
2563
2564 /* The principal pseudo-switch. Cannot be a switch, since we
2565 look into several different things. */
2566 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2567 || OP(scan) == IFTHEN) {
2568 next = regnext(scan);
2569 code = OP(scan);
2570 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2571
2572 if (OP(next) == code || code == IFTHEN) {
2573 /* NOTE - There is similar code to this block below for handling
2574 TRIE nodes on a re-study. If you change stuff here check there
2575 too. */
2576 I32 max1 = 0, min1 = I32_MAX, num = 0;
2577 struct regnode_charclass_class accum;
2578 regnode * const startbranch=scan;
2579
2580 if (flags & SCF_DO_SUBSTR)
304ee84b 2581 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2582 if (flags & SCF_DO_STCLASS)
2583 cl_init_zero(pRExC_state, &accum);
2584
2585 while (OP(scan) == code) {
2586 I32 deltanext, minnext, f = 0, fake;
2587 struct regnode_charclass_class this_class;
2588
2589 num++;
2590 data_fake.flags = 0;
2591 if (data) {
2592 data_fake.whilem_c = data->whilem_c;
2593 data_fake.last_closep = data->last_closep;
2594 }
2595 else
2596 data_fake.last_closep = &fake;
58e23c8d
YO
2597
2598 data_fake.pos_delta = delta;
8aa23a47
YO
2599 next = regnext(scan);
2600 scan = NEXTOPER(scan);
2601 if (code != BRANCH)
c277df42 2602 scan = NEXTOPER(scan);
8aa23a47
YO
2603 if (flags & SCF_DO_STCLASS) {
2604 cl_init(pRExC_state, &this_class);
2605 data_fake.start_class = &this_class;
2606 f = SCF_DO_STCLASS_AND;
58e23c8d 2607 }
8aa23a47
YO
2608 if (flags & SCF_WHILEM_VISITED_POS)
2609 f |= SCF_WHILEM_VISITED_POS;
2610
2611 /* we suppose the run is continuous, last=next...*/
2612 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2613 next, &data_fake,
2614 stopparen, recursed, NULL, f,depth+1);
2615 if (min1 > minnext)
2616 min1 = minnext;
2617 if (max1 < minnext + deltanext)
2618 max1 = minnext + deltanext;
2619 if (deltanext == I32_MAX)
2620 is_inf = is_inf_internal = 1;
2621 scan = next;
2622 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2623 pars++;
2624 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2625 if ( stopmin > minnext)
2626 stopmin = min + min1;
2627 flags &= ~SCF_DO_SUBSTR;
2628 if (data)
2629 data->flags |= SCF_SEEN_ACCEPT;
2630 }
2631 if (data) {
2632 if (data_fake.flags & SF_HAS_EVAL)
2633 data->flags |= SF_HAS_EVAL;
2634 data->whilem_c = data_fake.whilem_c;
3dab1dad 2635 }
8aa23a47
YO
2636 if (flags & SCF_DO_STCLASS)
2637 cl_or(pRExC_state, &accum, &this_class);
2638 }
2639 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2640 min1 = 0;
2641 if (flags & SCF_DO_SUBSTR) {
2642 data->pos_min += min1;
2643 data->pos_delta += max1 - min1;
2644 if (max1 != min1 || is_inf)
2645 data->longest = &(data->longest_float);
2646 }
2647 min += min1;
2648 delta += max1 - min1;
2649 if (flags & SCF_DO_STCLASS_OR) {
2650 cl_or(pRExC_state, data->start_class, &accum);
2651 if (min1) {
2652 cl_and(data->start_class, and_withp);
2653 flags &= ~SCF_DO_STCLASS;
653099ff 2654 }
8aa23a47
YO
2655 }
2656 else if (flags & SCF_DO_STCLASS_AND) {
2657 if (min1) {
2658 cl_and(data->start_class, &accum);
2659 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2660 }
8aa23a47
YO
2661 else {
2662 /* Switch to OR mode: cache the old value of
2663 * data->start_class */
2664 INIT_AND_WITHP;
2665 StructCopy(data->start_class, and_withp,
2666 struct regnode_charclass_class);
2667 flags &= ~SCF_DO_STCLASS_AND;
2668 StructCopy(&accum, data->start_class,
2669 struct regnode_charclass_class);
2670 flags |= SCF_DO_STCLASS_OR;
2671 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2672 }
8aa23a47 2673 }
a3621e74 2674
8aa23a47
YO
2675 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2676 /* demq.
a3621e74 2677
8aa23a47
YO
2678 Assuming this was/is a branch we are dealing with: 'scan' now
2679 points at the item that follows the branch sequence, whatever
2680 it is. We now start at the beginning of the sequence and look
2681 for subsequences of
a3621e74 2682
8aa23a47
YO
2683 BRANCH->EXACT=>x1
2684 BRANCH->EXACT=>x2
2685 tail
a3621e74 2686
8aa23a47 2687 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2688
8aa23a47
YO
2689 If we can find such a subseqence we need to turn the first
2690 element into a trie and then add the subsequent branch exact
2691 strings to the trie.
a3621e74 2692
8aa23a47 2693 We have two cases
a3621e74 2694
8aa23a47 2695 1. patterns where the whole set of branch can be converted.
a3621e74 2696
8aa23a47 2697 2. patterns where only a subset can be converted.
a3621e74 2698
8aa23a47
YO
2699 In case 1 we can replace the whole set with a single regop
2700 for the trie. In case 2 we need to keep the start and end
2701 branchs so
a3621e74 2702
8aa23a47
YO
2703 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2704 becomes BRANCH TRIE; BRANCH X;
786e8c11 2705
8aa23a47
YO
2706 There is an additional case, that being where there is a
2707 common prefix, which gets split out into an EXACT like node
2708 preceding the TRIE node.
a3621e74 2709
8aa23a47
YO
2710 If x(1..n)==tail then we can do a simple trie, if not we make
2711 a "jump" trie, such that when we match the appropriate word
2712 we "jump" to the appopriate tail node. Essentailly we turn
2713 a nested if into a case structure of sorts.
b515a41d 2714
8aa23a47
YO
2715 */
2716
2717 int made=0;
2718 if (!re_trie_maxbuff) {
2719 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2720 if (!SvIOK(re_trie_maxbuff))
2721 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2722 }
2723 if ( SvIV(re_trie_maxbuff)>=0 ) {
2724 regnode *cur;
2725 regnode *first = (regnode *)NULL;
2726 regnode *last = (regnode *)NULL;
2727 regnode *tail = scan;
2728 U8 optype = 0;
2729 U32 count=0;
a3621e74
YO
2730
2731#ifdef DEBUGGING
8aa23a47 2732 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2733#endif
8aa23a47
YO
2734 /* var tail is used because there may be a TAIL
2735 regop in the way. Ie, the exacts will point to the
2736 thing following the TAIL, but the last branch will
2737 point at the TAIL. So we advance tail. If we
2738 have nested (?:) we may have to move through several
2739 tails.
2740 */
2741
2742 while ( OP( tail ) == TAIL ) {
2743 /* this is the TAIL generated by (?:) */
2744 tail = regnext( tail );
2745 }
a3621e74 2746
8aa23a47
YO
2747
2748 DEBUG_OPTIMISE_r({
2749 regprop(RExC_rx, mysv, tail );
2750 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2751 (int)depth * 2 + 2, "",
2752 "Looking for TRIE'able sequences. Tail node is: ",
2753 SvPV_nolen_const( mysv )
2754 );
2755 });
2756
2757 /*
2758
2759 step through the branches, cur represents each
2760 branch, noper is the first thing to be matched
2761 as part of that branch and noper_next is the
2762 regnext() of that node. if noper is an EXACT
2763 and noper_next is the same as scan (our current
2764 position in the regex) then the EXACT branch is
2765 a possible optimization target. Once we have
2766 two or more consequetive such branches we can
2767 create a trie of the EXACT's contents and stich
2768 it in place. If the sequence represents all of
2769 the branches we eliminate the whole thing and
2770 replace it with a single TRIE. If it is a
2771 subsequence then we need to stitch it in. This
2772 means the first branch has to remain, and needs
2773 to be repointed at the item on the branch chain
2774 following the last branch optimized. This could
2775 be either a BRANCH, in which case the
2776 subsequence is internal, or it could be the
2777 item following the branch sequence in which
2778 case the subsequence is at the end.
2779
2780 */
2781
2782 /* dont use tail as the end marker for this traverse */
2783 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2784 regnode * const noper = NEXTOPER( cur );
b515a41d 2785#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2786 regnode * const noper_next = regnext( noper );
b515a41d
YO
2787#endif
2788
8aa23a47
YO
2789 DEBUG_OPTIMISE_r({
2790 regprop(RExC_rx, mysv, cur);
2791 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2792 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2793
2794 regprop(RExC_rx, mysv, noper);
2795 PerlIO_printf( Perl_debug_log, " -> %s",
2796 SvPV_nolen_const(mysv));
2797
2798 if ( noper_next ) {
2799 regprop(RExC_rx, mysv, noper_next );
2800 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2801 SvPV_nolen_const(mysv));
2802 }
2803 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2804 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2805 });
2806 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2807 : PL_regkind[ OP( noper ) ] == EXACT )
2808 || OP(noper) == NOTHING )
786e8c11 2809#ifdef NOJUMPTRIE
8aa23a47 2810 && noper_next == tail
786e8c11 2811#endif
8aa23a47
YO
2812 && count < U16_MAX)
2813 {
2814 count++;
2815 if ( !first || optype == NOTHING ) {
2816 if (!first) first = cur;
2817 optype = OP( noper );
2818 } else {
2819 last = cur;
2820 }
2821 } else {
a0a388a1
YO
2822/*
2823 Currently we assume that the trie can handle unicode and ascii
2824 matches fold cased matches. If this proves true then the following
2825 define will prevent tries in this situation.
2826
2827 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2828*/
2829#define TRIE_TYPE_IS_SAFE 1
2830 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2831 make_trie( pRExC_state,
2832 startbranch, first, cur, tail, count,
2833 optype, depth+1 );
2834 }
2835 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2836#ifdef NOJUMPTRIE
8aa23a47 2837 && noper_next == tail
786e8c11 2838#endif
8aa23a47
YO
2839 ){
2840 count = 1;
2841 first = cur;
2842 optype = OP( noper );
2843 } else {
2844 count = 0;
2845 first = NULL;
2846 optype = 0;
2847 }
2848 last = NULL;
2849 }
2850 }
2851 DEBUG_OPTIMISE_r({
2852 regprop(RExC_rx, mysv, cur);
2853 PerlIO_printf( Perl_debug_log,
2854 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2855 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2856
2857 });
a0a388a1
YO
2858
2859 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2860 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2861#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2862 if ( ((made == MADE_EXACT_TRIE &&
2863 startbranch == first)
2864 || ( first_non_open == first )) &&
2865 depth==0 ) {
2866 flags |= SCF_TRIE_RESTUDY;
2867 if ( startbranch == first
2868 && scan == tail )
2869 {
2870 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2871 }
2872 }
3dab1dad 2873#endif
8aa23a47
YO
2874 }
2875 }
2876
2877 } /* do trie */
2878
653099ff 2879 }
8aa23a47
YO
2880 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2881 scan = NEXTOPER(NEXTOPER(scan));
2882 } else /* single branch is optimized. */
2883 scan = NEXTOPER(scan);
2884 continue;
2885 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2886 scan_frame *newframe = NULL;
2887 I32 paren;
2888 regnode *start;
2889 regnode *end;
2890
2891 if (OP(scan) != SUSPEND) {
2892 /* set the pointer */
2893 if (OP(scan) == GOSUB) {
2894 paren = ARG(scan);
2895 RExC_recurse[ARG2L(scan)] = scan;
2896 start = RExC_open_parens[paren-1];
2897 end = RExC_close_parens[paren-1];
2898 } else {
2899 paren = 0;
f8fc2ecf 2900 start = RExC_rxi->program + 1;
8aa23a47
YO
2901 end = RExC_opend;
2902 }
2903 if (!recursed) {
2904 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2905 SAVEFREEPV(recursed);
2906 }
2907 if (!PAREN_TEST(recursed,paren+1)) {
2908 PAREN_SET(recursed,paren+1);
2909 Newx(newframe,1,scan_frame);
2910 } else {
2911 if (flags & SCF_DO_SUBSTR) {
304ee84b 2912 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2913 data->longest = &(data->longest_float);
2914 }
2915 is_inf = is_inf_internal = 1;
2916 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2917 cl_anything(pRExC_state, data->start_class);
2918 flags &= ~SCF_DO_STCLASS;
2919 }
2920 } else {
2921 Newx(newframe,1,scan_frame);
2922 paren = stopparen;
2923 start = scan+2;
2924 end = regnext(scan);
2925 }
2926 if (newframe) {
2927 assert(start);
2928 assert(end);
2929 SAVEFREEPV(newframe);
2930 newframe->next = regnext(scan);
2931 newframe->last = last;
2932 newframe->stop = stopparen;
2933 newframe->prev = frame;
2934
2935 frame = newframe;
2936 scan = start;
2937 stopparen = paren;
2938 last = end;
2939
2940 continue;
2941 }
2942 }
2943 else if (OP(scan) == EXACT) {
2944 I32 l = STR_LEN(scan);
2945 UV uc;
2946 if (UTF) {
2947 const U8 * const s = (U8*)STRING(scan);
2948 l = utf8_length(s, s + l);
2949 uc = utf8_to_uvchr(s, NULL);
2950 } else {
2951 uc = *((U8*)STRING(scan));
2952 }
2953 min += l;
2954 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2955 /* The code below prefers earlier match for fixed
2956 offset, later match for variable offset. */
2957 if (data->last_end == -1) { /* Update the start info. */
2958 data->last_start_min = data->pos_min;
2959 data->last_start_max = is_inf
2960 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2961 }
8aa23a47
YO
2962 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2963 if (UTF)
2964 SvUTF8_on(data->last_found);
2965 {
2966 SV * const sv = data->last_found;
2967 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2968 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2969 if (mg && mg->mg_len >= 0)
2970 mg->mg_len += utf8_length((U8*)STRING(scan),
2971 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2972 }
8aa23a47
YO
2973 data->last_end = data->pos_min + l;
2974 data->pos_min += l; /* As in the first entry. */
2975 data->flags &= ~SF_BEFORE_EOL;
2976 }
2977 if (flags & SCF_DO_STCLASS_AND) {
2978 /* Check whether it is compatible with what we know already! */
2979 int compat = 1;
2980
2981 if (uc >= 0x100 ||
2982 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2983 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2984 && (!(data->start_class->flags & ANYOF_FOLD)
2985 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2986 )
2987 compat = 0;
2988 ANYOF_CLASS_ZERO(data->start_class);
2989 ANYOF_BITMAP_ZERO(data->start_class);
2990 if (compat)
2991 ANYOF_BITMAP_SET(data->start_class, uc);
2992 data->start_class->flags &= ~ANYOF_EOS;
2993 if (uc < 0x100)
2994 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2995 }
2996 else if (flags & SCF_DO_STCLASS_OR) {
2997 /* false positive possible if the class is case-folded */
2998 if (uc < 0x100)
2999 ANYOF_BITMAP_SET(data->start_class, uc);
3000 else
3001 data->start_class->flags |= ANYOF_UNICODE_ALL;
3002 data->start_class->flags &= ~ANYOF_EOS;
3003 cl_and(data->start_class, and_withp);
3004 }
3005 flags &= ~SCF_DO_STCLASS;
3006 }
3007 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3008 I32 l = STR_LEN(scan);
3009 UV uc = *((U8*)STRING(scan));
3010
3011 /* Search for fixed substrings supports EXACT only. */
3012 if (flags & SCF_DO_SUBSTR) {
3013 assert(data);
304ee84b 3014 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3015 }
3016 if (UTF) {
3017 const U8 * const s = (U8 *)STRING(scan);
3018 l = utf8_length(s, s + l);
3019 uc = utf8_to_uvchr(s, NULL);
3020 }
3021 min += l;
3022 if (flags & SCF_DO_SUBSTR)
3023 data->pos_min += l;
3024 if (flags & SCF_DO_STCLASS_AND) {
3025 /* Check whether it is compatible with what we know already! */
3026 int compat = 1;
3027
3028 if (uc >= 0x100 ||
3029 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3030 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3031 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3032 compat = 0;
3033 ANYOF_CLASS_ZERO(data->start_class);
3034 ANYOF_BITMAP_ZERO(data->start_class);
3035 if (compat) {
3036 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3037 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3038 data->start_class->flags |= ANYOF_FOLD;
3039 if (OP(scan) == EXACTFL)
3040 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3041 }
8aa23a47
YO
3042 }
3043 else if (flags & SCF_DO_STCLASS_OR) {
3044 if (data->start_class->flags & ANYOF_FOLD) {
3045 /* false positive possible if the class is case-folded.
3046 Assume that the locale settings are the same... */
1aa99e6b
IH
3047 if (uc < 0x100)
3048 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3049 data->start_class->flags &= ~ANYOF_EOS;
3050 }
8aa23a47 3051 cl_and(data->start_class, and_withp);
653099ff 3052 }
8aa23a47
YO
3053 flags &= ~SCF_DO_STCLASS;
3054 }
3055 else if (strchr((const char*)PL_varies,OP(scan))) {
3056 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3057 I32 f = flags, pos_before = 0;
3058 regnode * const oscan = scan;
3059 struct regnode_charclass_class this_class;
3060 struct regnode_charclass_class *oclass = NULL;
3061 I32 next_is_eval = 0;
3062
3063 switch (PL_regkind[OP(scan)]) {
3064 case WHILEM: /* End of (?:...)* . */
3065 scan = NEXTOPER(scan);
3066 goto finish;
3067 case PLUS:
3068 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3069 next = NEXTOPER(scan);
3070 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3071 mincount = 1;
3072 maxcount = REG_INFTY;
3073 next = regnext(scan);
3074 scan = NEXTOPER(scan);
3075 goto do_curly;
3076 }
3077 }
3078 if (flags & SCF_DO_SUBSTR)
3079 data->pos_min++;
3080 min++;
3081 /* Fall through. */
3082 case STAR:
3083 if (flags & SCF_DO_STCLASS) {
3084 mincount = 0;
3085 maxcount = REG_INFTY;
3086 next = regnext(scan);
3087 scan = NEXTOPER(scan);
3088 goto do_curly;
3089 }
3090 is_inf = is_inf_internal = 1;
3091 scan = regnext(scan);
c277df42 3092 if (flags & SCF_DO_SUBSTR) {
304ee84b 3093 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3094 data->longest = &(data->longest_float);
c277df42 3095 }
8aa23a47
YO
3096 goto optimize_curly_tail;
3097 case CURLY:
3098 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3099 && (scan->flags == stopparen))
3100 {
3101 mincount = 1;
3102 maxcount = 1;
3103 } else {
3104 mincount = ARG1(scan);
3105 maxcount = ARG2(scan);
653099ff 3106 }
8aa23a47
YO
3107 next = regnext(scan);
3108 if (OP(scan) == CURLYX) {
3109 I32 lp = (data ? *(data->last_closep) : 0);
3110 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3111 }
8aa23a47
YO
3112 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3113 next_is_eval = (OP(scan) == EVAL);
3114 do_curly:
3115 if (flags & SCF_DO_SUBSTR) {
304ee84b 3116 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3117 pos_before = data->pos_min;
b45f050a 3118 }
8aa23a47
YO
3119 if (data) {
3120 fl = data->flags;
3121 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3122 if (is_inf)
3123 data->flags |= SF_IS_INF;
3124 }
3125 if (flags & SCF_DO_STCLASS) {
3126 cl_init(pRExC_state, &this_class);
3127 oclass = data->start_class;
3128 data->start_class = &this_class;
3129 f |= SCF_DO_STCLASS_AND;
3130 f &= ~SCF_DO_STCLASS_OR;
3131 }
3132 /* These are the cases when once a subexpression
3133 fails at a particular position, it cannot succeed
3134 even after backtracking at the enclosing scope.
3135
3136 XXXX what if minimal match and we are at the
3137 initial run of {n,m}? */
3138 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3139 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3140
8aa23a47
YO
3141 /* This will finish on WHILEM, setting scan, or on NULL: */
3142 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3143 last, data, stopparen, recursed, NULL,
3144 (mincount == 0
3145 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3146
8aa23a47
YO
3147 if (flags & SCF_DO_STCLASS)
3148 data->start_class = oclass;
3149 if (mincount == 0 || minnext == 0) {
3150 if (flags & SCF_DO_STCLASS_OR) {
3151 cl_or(pRExC_state, data->start_class, &this_class);
3152 }
3153 else if (flags & SCF_DO_STCLASS_AND) {
3154 /* Switch to OR mode: cache the old value of
3155 * data->start_class */
3156 INIT_AND_WITHP;
3157 StructCopy(data->start_class, and_withp,
3158 struct regnode_charclass_class);
3159 flags &= ~SCF_DO_STCLASS_AND;
3160 StructCopy(&this_class, data->start_class,
3161 struct regnode_charclass_class);
3162 flags |= SCF_DO_STCLASS_OR;
3163 data->start_class->flags |= ANYOF_EOS;
3164 }
3165 } else { /* Non-zero len */
3166 if (flags & SCF_DO_STCLASS_OR) {
3167 cl_or(pRExC_state, data->start_class, &this_class);
3168 cl_and(data->start_class, and_withp);
3169 }
3170 else if (flags & SCF_DO_STCLASS_AND)
3171 cl_and(data->start_class, &this_class);
3172 flags &= ~SCF_DO_STCLASS;
3173 }
3174 if (!scan) /* It was not CURLYX, but CURLY. */
3175 scan = next;
3176 if ( /* ? quantifier ok, except for (?{ ... }) */
3177 (next_is_eval || !(mincount == 0 && maxcount == 1))
3178 && (minnext == 0) && (deltanext == 0)
3179 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3180 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3181 && ckWARN(WARN_REGEXP))
3182 {
3183 vWARN(RExC_parse,
3184 "Quantifier unexpected on zero-length expression");
3185 }
3186
3187 min += minnext * mincount;
3188 is_inf_internal |= ((maxcount == REG_INFTY
3189 && (minnext + deltanext) > 0)
3190 || deltanext == I32_MAX);
3191 is_inf |= is_inf_internal;
3192 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3193
3194 /* Try powerful optimization CURLYX => CURLYN. */
3195 if ( OP(oscan) == CURLYX && data
3196 && data->flags & SF_IN_PAR
3197 && !(data->flags & SF_HAS_EVAL)
3198 && !deltanext && minnext == 1 ) {
3199 /* Try to optimize to CURLYN. */
3200 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3201 regnode * const nxt1 = nxt;
497b47a8 3202#ifdef DEBUGGING
8aa23a47 3203 regnode *nxt2;
497b47a8 3204#endif
c277df42 3205
8aa23a47
YO
3206 /* Skip open. */
3207 nxt = regnext(nxt);
3208 if (!strchr((const char*)PL_simple,OP(nxt))
3209 && !(PL_regkind[OP(nxt)] == EXACT
3210 && STR_LEN(nxt) == 1))
3211 goto nogo;
497b47a8 3212#ifdef DEBUGGING
8aa23a47 3213 nxt2 = nxt;
497b47a8 3214#endif
8aa23a47
YO
3215 nxt = regnext(nxt);
3216 if (OP(nxt) != CLOSE)
3217 goto nogo;
3218 if (RExC_open_parens) {
3219 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3220 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3221 }
3222 /* Now we know that nxt2 is the only contents: */
3223 oscan->flags = (U8)ARG(nxt);
3224 OP(oscan) = CURLYN;
3225 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3226
c277df42 3227#ifdef DEBUGGING
8aa23a47
YO
3228 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3229 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3230 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3231 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3232 OP(nxt + 1) = OPTIMIZED; /* was count. */
3233 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3234#endif
8aa23a47
YO
3235 }
3236 nogo:
3237
3238 /* Try optimization CURLYX => CURLYM. */
3239 if ( OP(oscan) == CURLYX && data
3240 && !(data->flags & SF_HAS_PAR)
3241 && !(data->flags & SF_HAS_EVAL)
3242 && !deltanext /* atom is fixed width */
3243 && minnext != 0 /* CURLYM can't handle zero width */
3244 ) {
3245 /* XXXX How to optimize if data == 0? */
3246 /* Optimize to a simpler form. */
3247 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3248 regnode *nxt2;
3249
3250 OP(oscan) = CURLYM;
3251 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3252 && (OP(nxt2) != WHILEM))
3253 nxt = nxt2;
3254 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3255 /* Need to optimize away parenths. */
3256 if (data->flags & SF_IN_PAR) {
3257 /* Set the parenth number. */
3258 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3259
3260 if (OP(nxt) != CLOSE)
3261 FAIL("Panic opt close");
3262 oscan->flags = (U8)ARG(nxt);
3263 if (RExC_open_parens) {
3264 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3265 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3266 }
8aa23a47
YO
3267 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3268 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3269
c277df42 3270#ifdef DEBUGGING
8aa23a47
YO
3271 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3272 OP(nxt + 1) = OPTIMIZED; /* was count. */
3273 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3274 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3275#endif
c277df42 3276#if 0
8aa23a47
YO
3277 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3278 regnode *nnxt = regnext(nxt1);
3279
3280 if (nnxt == nxt) {
3281 if (reg_off_by_arg[OP(nxt1)])
3282 ARG_SET(nxt1, nxt2 - nxt1);
3283 else if (nxt2 - nxt1 < U16_MAX)
3284 NEXT_OFF(nxt1) = nxt2 - nxt1;
3285 else
3286 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3287 }
8aa23a47 3288 nxt1 = nnxt;
c277df42 3289 }
5d1c421c 3290#endif
8aa23a47
YO
3291 /* Optimize again: */
3292 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3293 NULL, stopparen, recursed, NULL, 0,depth+1);
3294 }
3295 else
3296 oscan->flags = 0;
3297 }
3298 else if ((OP(oscan) == CURLYX)
3299 && (flags & SCF_WHILEM_VISITED_POS)
3300 /* See the comment on a similar expression above.
3301 However, this time it not a subexpression
3302 we care about, but the expression itself. */
3303 && (maxcount == REG_INFTY)
3304 && data && ++data->whilem_c < 16) {
3305 /* This stays as CURLYX, we can put the count/of pair. */
3306 /* Find WHILEM (as in regexec.c) */
3307 regnode *nxt = oscan + NEXT_OFF(oscan);
3308
3309 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3310 nxt += ARG(nxt);
3311 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3312 | (RExC_whilem_seen << 4)); /* On WHILEM */
3313 }
3314 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3315 pars++;
3316 if (flags & SCF_DO_SUBSTR) {
3317 SV *last_str = NULL;
3318 int counted = mincount != 0;
a0ed51b3 3319
8aa23a47
YO
3320 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3321#if defined(SPARC64_GCC_WORKAROUND)
3322 I32 b = 0;
3323 STRLEN l = 0;
3324 const char *s = NULL;
3325 I32 old = 0;
b515a41d 3326
8aa23a47
YO
3327 if (pos_before >= data->last_start_min)
3328 b = pos_before;
3329 else
3330 b = data->last_start_min;
b515a41d 3331
8aa23a47
YO
3332 l = 0;
3333 s = SvPV_const(data->last_found, l);
3334 old = b - data->last_start_min;
3335
3336#else
3337 I32 b = pos_before >= data->last_start_min
3338 ? pos_before : data->last_start_min;
3339 STRLEN l;
3340 const char * const s = SvPV_const(data->last_found, l);
3341 I32 old = b - data->last_start_min;
3342#endif
3343
3344 if (UTF)
3345 old = utf8_hop((U8*)s, old) - (U8*)s;
3346
3347 l -= old;
3348 /* Get the added string: */
740cce10 3349 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3350 if (deltanext == 0 && pos_before == b) {
3351 /* What was added is a constant string */
3352 if (mincount > 1) {
3353 SvGROW(last_str, (mincount * l) + 1);
3354 repeatcpy(SvPVX(last_str) + l,
3355 SvPVX_const(last_str), l, mincount - 1);
3356 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3357 /* Add additional parts. */
3358 SvCUR_set(data->last_found,
3359 SvCUR(data->last_found) - l);
3360 sv_catsv(data->last_found, last_str);
3361 {
3362 SV * sv = data->last_found;
3363 MAGIC *mg =
3364 SvUTF8(sv) && SvMAGICAL(sv) ?
3365 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3366 if (mg && mg->mg_len >= 0)
bd94e887 3367 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3368 }
8aa23a47 3369 data->last_end += l * (mincount - 1);
b515a41d 3370 }
8aa23a47
YO
3371 } else {
3372 /* start offset must point into the last copy */
3373 data->last_start_min += minnext * (mincount - 1);
3374 data->last_start_max += is_inf ? I32_MAX
3375 : (maxcount - 1) * (minnext + data->pos_delta);
3376 }
c277df42 3377 }
8aa23a47
YO
3378 /* It is counted once already... */
3379 data->pos_min += minnext * (mincount - counted);
3380 data->pos_delta += - counted * deltanext +
3381 (minnext + deltanext) * maxcount - minnext * mincount;
3382 if (mincount != maxcount) {
3383 /* Cannot extend fixed substrings found inside
3384 the group. */
304ee84b 3385 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3386 if (mincount && last_str) {
3387 SV * const sv = data->last_found;
3388 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3389 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3390
3391 if (mg)
3392 mg->mg_len = -1;
3393 sv_setsv(sv, last_str);
3394 data->last_end = data->pos_min;
3395 data->last_start_min =
3396 data->pos_min - CHR_SVLEN(last_str);
3397 data->last_start_max = is_inf
3398 ? I32_MAX
3399 : data->pos_min + data->pos_delta
3400 - CHR_SVLEN(last_str);
3401 }
3402 data->longest = &(data->longest_float);
3403 }
3404 SvREFCNT_dec(last_str);
c277df42 3405 }
8aa23a47
YO
3406 if (data && (fl & SF_HAS_EVAL))
3407 data->flags |= SF_HAS_EVAL;
3408 optimize_curly_tail:
3409 if (OP(oscan) != CURLYX) {
3410 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3411 && NEXT_OFF(next))
3412 NEXT_OFF(oscan) += NEXT_OFF(next);
3413 }
3414 continue;
3415 default: /* REF and CLUMP only? */
3416 if (flags & SCF_DO_SUBSTR) {
304ee84b 3417 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3418 data->longest = &(data->longest_float);
3419 }
3420 is_inf = is_inf_internal = 1;
3421 if (flags & SCF_DO_STCLASS_OR)
3422 cl_anything(pRExC_state, data->start_class);
3423 flags &= ~SCF_DO_STCLASS;
3424 break;
c277df42 3425 }
8aa23a47 3426 }
e1d1eefb
YO
3427 else if (OP(scan) == LNBREAK) {
3428 if (flags & SCF_DO_STCLASS) {
3429 int value = 0;
3430 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3431 if (flags & SCF_DO_STCLASS_AND) {
3432 for (value = 0; value < 256; value++)
e64b1bd1 3433 if (!is_VERTWS_cp(value))
e1d1eefb
YO
3434 ANYOF_BITMAP_CLEAR(data->start_class, value);
3435 }
3436 else {
3437 for (value = 0; value < 256; value++)
e64b1bd1 3438 if (is_VERTWS_cp(value))
e1d1eefb
YO
3439 ANYOF_BITMAP_SET(data->start_class, value);
3440 }
3441 if (flags & SCF_DO_STCLASS_OR)
3442 cl_and(data->start_class, and_withp);
3443 flags &= ~SCF_DO_STCLASS;
3444 }
3445 min += 1;
f9a79580 3446 delta += 1;
e1d1eefb
YO
3447 if (flags & SCF_DO_SUBSTR) {
3448 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3449 data->pos_min += 1;
f9a79580 3450 data->pos_delta += 1;
e1d1eefb
YO
3451 data->longest = &(data->longest_float);
3452 }
3453
3454 }
f9a79580
RGS
3455 else if (OP(scan) == FOLDCHAR) {
3456 int d = ARG(scan)==0xDF ? 1 : 2;
3457 flags &= ~SCF_DO_STCLASS;
3458 min += 1;
3459 delta += d;
3460 if (flags & SCF_DO_SUBSTR) {
3461 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3462 data->pos_min += 1;
3463 data->pos_delta += d;
3464 data->longest = &(data->longest_float);
3465 }
3466 }
8aa23a47
YO
3467 else if (strchr((const char*)PL_simple,OP(scan))) {
3468 int value = 0;
653099ff 3469
8aa23a47 3470 if (flags & SCF_DO_SUBSTR) {
304ee84b 3471 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3472 data->pos_min++;
3473 }
3474 min++;
3475 if (flags & SCF_DO_STCLASS) {
3476 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3477
8aa23a47
YO
3478 /* Some of the logic below assumes that switching
3479 locale on will only add false positives. */
3480 switch (PL_regkind[OP(scan)]) {
3481 case SANY:
3482 default:
3483 do_default:
3484 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3485 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3486 cl_anything(pRExC_state, data->start_class);
3487 break;
3488 case REG_ANY:
3489 if (OP(scan) == SANY)
3490 goto do_default;
3491 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3492 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3493 || (data->start_class->flags & ANYOF_CLASS));
3494 cl_anything(pRExC_state, data->start_class);
653099ff 3495 }
8aa23a47
YO
3496 if (flags & SCF_DO_STCLASS_AND || !value)
3497 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3498 break;
3499 case ANYOF:
3500 if (flags & SCF_DO_STCLASS_AND)
3501 cl_and(data->start_class,
3502 (struct regnode_charclass_class*)scan);
653099ff 3503 else
8aa23a47
YO
3504 cl_or(pRExC_state, data->start_class,
3505 (struct regnode_charclass_class*)scan);
3506 break;
3507 case ALNUM:
3508 if (flags & SCF_DO_STCLASS_AND) {
3509 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3510 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3511 for (value = 0; value < 256; value++)
3512 if (!isALNUM(value))
3513 ANYOF_BITMAP_CLEAR(data->start_class, value);
3514 }
653099ff 3515 }
8aa23a47
YO
3516 else {
3517 if (data->start_class->flags & ANYOF_LOCALE)
3518 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3519 else {
3520 for (value = 0; value < 256; value++)
3521 if (isALNUM(value))
3522 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3523 }
8aa23a47
YO
3524 }
3525 break;
3526 case ALNUML:
3527 if (flags & SCF_DO_STCLASS_AND) {
3528 if (data->start_class->flags & ANYOF_LOCALE)
3529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3530 }
3531 else {
3532 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3533 data->start_class->flags |= ANYOF_LOCALE;
3534 }
3535 break;
3536 case NALNUM:
3537 if (flags & SCF_DO_STCLASS_AND) {
3538 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3539 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3540 for (value = 0; value < 256; value++)
3541 if (isALNUM(value))
3542 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3543 }
3544 }
8aa23a47
YO
3545 else {
3546 if (data->start_class->flags & ANYOF_LOCALE)
3547 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3548 else {
3549 for (value = 0; value < 256; value++)
3550 if (!isALNUM(value))
3551 ANYOF_BITMAP_SET(data->start_class, value);
3552 }
653099ff 3553 }
8aa23a47
YO
3554 break;
3555 case NALNUML:
3556 if (flags & SCF_DO_STCLASS_AND) {
3557 if (data->start_class->flags & ANYOF_LOCALE)
3558 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3559 }
8aa23a47
YO
3560 else {
3561 data->start_class->flags |= ANYOF_LOCALE;
3562 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3563 }
3564 break;
3565 case SPACE:
3566 if (flags & SCF_DO_STCLASS_AND) {
3567 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3568 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3569 for (value = 0; value < 256; value++)
3570 if (!isSPACE(value))
3571 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3572 }
3573 }
8aa23a47
YO
3574 else {
3575 if (data->start_class->flags & ANYOF_LOCALE)
3576 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3577 else {
3578 for (value = 0; value < 256; value++)
3579 if (isSPACE(value))
3580 ANYOF_BITMAP_SET(data->start_class, value);
3581 }
653099ff 3582 }
8aa23a47
YO
3583 break;
3584 case SPACEL:
3585 if (flags & SCF_DO_STCLASS_AND) {
3586 if (data->start_class->flags & ANYOF_LOCALE)
3587 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3588 }
3589 else {
3590 data->start_class->flags |= ANYOF_LOCALE;
3591 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3592 }
3593 break;
3594 case NSPACE:
3595 if (flags & SCF_DO_STCLASS_AND) {
3596 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3597 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3598 for (value = 0; value < 256; value++)
3599 if (isSPACE(value))
3600 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3601 }
8aa23a47
YO
3602 }
3603 else {
3604 if (data->start_class->flags & ANYOF_LOCALE)
3605 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3606 else {
3607 for (value = 0; value < 256; value++)
3608 if (!isSPACE(value))
3609 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3610 }
3611 }
8aa23a47
YO
3612 break;
3613 case NSPACEL:
3614 if (flags & SCF_DO_STCLASS_AND) {
3615 if (data->start_class->flags & ANYOF_LOCALE) {
3616 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3617 for (value = 0; value < 256; value++)
3618 if (!isSPACE(value))
3619 ANYOF_BITMAP_CLEAR(data->start_class, value);
3620 }
653099ff 3621 }
8aa23a47
YO
3622 else {
3623 data->start_class->flags |= ANYOF_LOCALE;
3624 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3625 }
3626 break;
3627 case DIGIT:
3628 if (flags & SCF_DO_STCLASS_AND) {
3629 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3630 for (value = 0; value < 256; value++)
3631 if (!isDIGIT(value))
3632 ANYOF_BITMAP_CLEAR(data->start_class, value);
3633 }
3634 else {
3635 if (data->start_class->flags & ANYOF_LOCALE)
3636 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3637 else {
3638 for (value = 0; value < 256; value++)
3639 if (isDIGIT(value))
3640 ANYOF_BITMAP_SET(data->start_class, value);
3641 }
3642 }
3643 break;
3644 case NDIGIT:
3645 if (flags & SCF_DO_STCLASS_AND) {
3646 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3647 for (value = 0; value < 256; value++)
3648 if (isDIGIT(value))
3649 ANYOF_BITMAP_CLEAR(data->start_class, value);
3650 }
3651 else {
3652 if (data->start_class->flags & ANYOF_LOCALE)
3653 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3654 else {
3655 for (value = 0; value < 256; value++)
3656 if (!isDIGIT(value))
3657 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3658 }
3659 }
8aa23a47 3660 break;
e1d1eefb
YO
3661 CASE_SYNST_FNC(VERTWS);
3662 CASE_SYNST_FNC(HORIZWS);
3663
8aa23a47
YO
3664 }
3665 if (flags & SCF_DO_STCLASS_OR)
3666 cl_and(data->start_class, and_withp);
3667 flags &= ~SCF_DO_STCLASS;
3668 }
3669 }
3670 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3671 data->flags |= (OP(scan) == MEOL
3672 ? SF_BEFORE_MEOL
3673 : SF_BEFORE_SEOL);
3674 }
3675 else if ( PL_regkind[OP(scan)] == BRANCHJ
3676 /* Lookbehind, or need to calculate parens/evals/stclass: */
3677 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3678 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3679 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3680 || OP(scan) == UNLESSM )
3681 {
3682 /* Negative Lookahead/lookbehind
3683 In this case we can't do fixed string optimisation.
3684 */
1de06328 3685
8aa23a47
YO
3686 I32 deltanext, minnext, fake = 0;
3687 regnode *nscan;
3688 struct regnode_charclass_class intrnl;
3689 int f = 0;
1de06328 3690
8aa23a47
YO
3691 data_fake.flags = 0;
3692 if (data) {
3693 data_fake.whilem_c = data->whilem_c;
3694 data_fake.last_closep = data->last_closep;
c277df42 3695 }
8aa23a47
YO
3696 else
3697 data_fake.last_closep = &fake;
58e23c8d 3698 data_fake.pos_delta = delta;
8aa23a47
YO
3699 if ( flags & SCF_DO_STCLASS && !scan->flags
3700 && OP(scan) == IFMATCH ) { /* Lookahead */
3701 cl_init(pRExC_state, &intrnl);
3702 data_fake.start_class = &intrnl;
3703 f |= SCF_DO_STCLASS_AND;
3704 }
3705 if (flags & SCF_WHILEM_VISITED_POS)
3706 f |= SCF_WHILEM_VISITED_POS;
3707 next = regnext(scan);
3708 nscan = NEXTOPER(NEXTOPER(scan));
3709 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3710 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3711 if (scan->flags) {
3712 if (deltanext) {
58e23c8d 3713 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3714 }
3715 else if (minnext > (I32)U8_MAX) {
58e23c8d 3716 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3717 }
3718 scan->flags = (U8)minnext;
3719 }
3720 if (data) {
3721 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3722 pars++;
3723 if (data_fake.flags & SF_HAS_EVAL)
3724 data->flags |= SF_HAS_EVAL;
3725 data->whilem_c = data_fake.whilem_c;
3726 }
3727 if (f & SCF_DO_STCLASS_AND) {
3728 const int was = (data->start_class->flags & ANYOF_EOS);
3729
3730 cl_and(data->start_class, &intrnl);
3731 if (was)
3732 data->start_class->flags |= ANYOF_EOS;
3733 }
cb434fcc 3734 }
8aa23a47
YO
3735#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3736 else {
3737 /* Positive Lookahead/lookbehind
3738 In this case we can do fixed string optimisation,
3739 but we must be careful about it. Note in the case of
3740 lookbehind the positions will be offset by the minimum
3741 length of the pattern, something we won't know about
3742 until after the recurse.
3743 */
3744 I32 deltanext, fake = 0;
3745 regnode *nscan;
3746 struct regnode_charclass_class intrnl;
3747 int f = 0;
3748 /* We use SAVEFREEPV so that when the full compile
3749 is finished perl will clean up the allocated
3750 minlens when its all done. This was we don't
3751 have to worry about freeing them when we know
3752 they wont be used, which would be a pain.
3753 */
3754 I32 *minnextp;
3755 Newx( minnextp, 1, I32 );
3756 SAVEFREEPV(minnextp);
3757
3758 if (data) {
3759 StructCopy(data, &data_fake, scan_data_t);
3760 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3761 f |= SCF_DO_SUBSTR;
3762 if (scan->flags)
304ee84b 3763 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3764 data_fake.last_found=newSVsv(data->last_found);
3765 }
3766 }
3767 else
3768 data_fake.last_closep = &fake;
3769 data_fake.flags = 0;
58e23c8d 3770 data_fake.pos_delta = delta;
8aa23a47
YO
3771 if (is_inf)
3772 data_fake.flags |= SF_IS_INF;
3773 if ( flags & SCF_DO_STCLASS && !scan->flags
3774 && OP(scan) == IFMATCH ) { /* Lookahead */
3775 cl_init(pRExC_state, &intrnl);
3776 data_fake.start_class = &intrnl;
3777 f |= SCF_DO_STCLASS_AND;
3778 }
3779 if (flags & SCF_WHILEM_VISITED_POS)
3780 f |= SCF_WHILEM_VISITED_POS;
3781 next = regnext(scan);
3782 nscan = NEXTOPER(NEXTOPER(scan));
3783
3784 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3785 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3786 if (scan->flags) {
3787 if (deltanext) {
58e23c8d 3788 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3789 }
3790 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3791 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3792 }
3793 scan->flags = (U8)*minnextp;
3794 }
3795
3796 *minnextp += min;