This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move widecharmap out of the shared structure _reg_trie_data into the
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
40d049e4
YO
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
3dab1dad 135#ifdef DEBUGGING
be8e71aa 136 const char *lastparse;
3dab1dad
YO
137 I32 lastnum;
138#define RExC_lastparse (pRExC_state->lastparse)
139#define RExC_lastnum (pRExC_state->lastnum)
140#endif
830247a4
IZ
141} RExC_state_t;
142
e2509266 143#define RExC_flags (pRExC_state->flags)
830247a4
IZ
144#define RExC_precomp (pRExC_state->precomp)
145#define RExC_rx (pRExC_state->rx)
f8fc2ecf 146#define RExC_rxi (pRExC_state->rxi)
fac92740 147#define RExC_start (pRExC_state->start)
830247a4
IZ
148#define RExC_end (pRExC_state->end)
149#define RExC_parse (pRExC_state->parse)
150#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 151#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 152#define RExC_emit (pRExC_state->emit)
fac92740 153#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
154#define RExC_naughty (pRExC_state->naughty)
155#define RExC_sawback (pRExC_state->sawback)
156#define RExC_seen (pRExC_state->seen)
157#define RExC_size (pRExC_state->size)
158#define RExC_npar (pRExC_state->npar)
c74340f9 159#define RExC_cpar (pRExC_state->cpar)
e2e6a0f1 160#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 165#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
166#define RExC_open_parens (pRExC_state->open_parens)
167#define RExC_close_parens (pRExC_state->close_parens)
168#define RExC_opend (pRExC_state->opend)
81714fb9 169#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
170#define RExC_recurse (pRExC_state->recurse)
171#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 172
a687059c
LW
173#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
174#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
175 ((*s) == '{' && regcurly(s)))
a687059c 176
35c8bce7
LW
177#ifdef SPSTART
178#undef SPSTART /* dratted cpp namespace... */
179#endif
a687059c
LW
180/*
181 * Flags to be passed up and down.
182 */
a687059c 183#define WORST 0 /* Worst case. */
821b33a5 184#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
185#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
186#define SPSTART 0x4 /* Starts with * or +. */
187#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 188
3dab1dad
YO
189#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
190
07be1b83
YO
191/* whether trie related optimizations are enabled */
192#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
193#define TRIE_STUDY_OPT
786e8c11 194#define FULL_TRIE_STUDY
07be1b83
YO
195#define TRIE_STCLASS
196#endif
1de06328
YO
197
198
40d049e4
YO
199
200#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
201#define PBITVAL(paren) (1 << ((paren) & 7))
202#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
203#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
204#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
205
206
1de06328
YO
207/* About scan_data_t.
208
209 During optimisation we recurse through the regexp program performing
210 various inplace (keyhole style) optimisations. In addition study_chunk
211 and scan_commit populate this data structure with information about
212 what strings MUST appear in the pattern. We look for the longest
213 string that must appear for at a fixed location, and we look for the
214 longest string that may appear at a floating location. So for instance
215 in the pattern:
216
217 /FOO[xX]A.*B[xX]BAR/
218
219 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
220 strings (because they follow a .* construct). study_chunk will identify
221 both FOO and BAR as being the longest fixed and floating strings respectively.
222
223 The strings can be composites, for instance
224
225 /(f)(o)(o)/
226
227 will result in a composite fixed substring 'foo'.
228
229 For each string some basic information is maintained:
230
231 - offset or min_offset
232 This is the position the string must appear at, or not before.
233 It also implicitly (when combined with minlenp) tells us how many
234 character must match before the string we are searching.
235 Likewise when combined with minlenp and the length of the string
236 tells us how many characters must appear after the string we have
237 found.
238
239 - max_offset
240 Only used for floating strings. This is the rightmost point that
241 the string can appear at. Ifset to I32 max it indicates that the
242 string can occur infinitely far to the right.
243
244 - minlenp
245 A pointer to the minimum length of the pattern that the string
246 was found inside. This is important as in the case of positive
247 lookahead or positive lookbehind we can have multiple patterns
248 involved. Consider
249
250 /(?=FOO).*F/
251
252 The minimum length of the pattern overall is 3, the minimum length
253 of the lookahead part is 3, but the minimum length of the part that
254 will actually match is 1. So 'FOO's minimum length is 3, but the
255 minimum length for the F is 1. This is important as the minimum length
256 is used to determine offsets in front of and behind the string being
257 looked for. Since strings can be composites this is the length of the
258 pattern at the time it was commited with a scan_commit. Note that
259 the length is calculated by study_chunk, so that the minimum lengths
260 are not known until the full pattern has been compiled, thus the
261 pointer to the value.
262
263 - lookbehind
264
265 In the case of lookbehind the string being searched for can be
266 offset past the start point of the final matching string.
267 If this value was just blithely removed from the min_offset it would
268 invalidate some of the calculations for how many chars must match
269 before or after (as they are derived from min_offset and minlen and
270 the length of the string being searched for).
271 When the final pattern is compiled and the data is moved from the
272 scan_data_t structure into the regexp structure the information
273 about lookbehind is factored in, with the information that would
274 have been lost precalculated in the end_shift field for the
275 associated string.
276
277 The fields pos_min and pos_delta are used to store the minimum offset
278 and the delta to the maximum offset at the current point in the pattern.
279
280*/
2c2d71f5
JH
281
282typedef struct scan_data_t {
1de06328
YO
283 /*I32 len_min; unused */
284 /*I32 len_delta; unused */
2c2d71f5
JH
285 I32 pos_min;
286 I32 pos_delta;
287 SV *last_found;
1de06328 288 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
289 I32 last_start_min;
290 I32 last_start_max;
1de06328
YO
291 SV **longest; /* Either &l_fixed, or &l_float. */
292 SV *longest_fixed; /* longest fixed string found in pattern */
293 I32 offset_fixed; /* offset where it starts */
294 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
295 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
296 SV *longest_float; /* longest floating string found in pattern */
297 I32 offset_float_min; /* earliest point in string it can appear */
298 I32 offset_float_max; /* latest point in string it can appear */
299 I32 *minlen_float; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
301 I32 flags;
302 I32 whilem_c;
cb434fcc 303 I32 *last_closep;
653099ff 304 struct regnode_charclass_class *start_class;
2c2d71f5
JH
305} scan_data_t;
306
a687059c 307/*
e50aee73 308 * Forward declarations for pregcomp()'s friends.
a687059c 309 */
a0d0e21e 310
27da23d5 311static const scan_data_t zero_scan_data =
1de06328 312 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
313
314#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
315#define SF_BEFORE_SEOL 0x0001
316#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
317#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
318#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
319
09b7f37c
CB
320#ifdef NO_UNARY_PLUS
321# define SF_FIX_SHIFT_EOL (0+2)
322# define SF_FL_SHIFT_EOL (0+4)
323#else
324# define SF_FIX_SHIFT_EOL (+2)
325# define SF_FL_SHIFT_EOL (+4)
326#endif
c277df42
IZ
327
328#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
329#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
330
331#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
332#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
333#define SF_IS_INF 0x0040
334#define SF_HAS_PAR 0x0080
335#define SF_IN_PAR 0x0100
336#define SF_HAS_EVAL 0x0200
337#define SCF_DO_SUBSTR 0x0400
653099ff
GS
338#define SCF_DO_STCLASS_AND 0x0800
339#define SCF_DO_STCLASS_OR 0x1000
340#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 341#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 342
786e8c11 343#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 344#define SCF_SEEN_ACCEPT 0x8000
07be1b83 345
eb160463 346#define UTF (RExC_utf8 != 0)
bbe252da
YO
347#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
348#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 349
ffc61ed2 350#define OOB_UNICODE 12345678
93733859 351#define OOB_NAMEDCLASS -1
b8c5462f 352
a0ed51b3
LW
353#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
354#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
355
8615cb43 356
b45f050a
JF
357/* length of regex to show in messages that don't mark a position within */
358#define RegexLengthToShowInErrorMessages 127
359
360/*
361 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
362 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
363 * op/pragma/warn/regcomp.
364 */
7253e4e3
RK
365#define MARKER1 "<-- HERE" /* marker as it appears in the description */
366#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 367
7253e4e3 368#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
369
370/*
371 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
372 * arg. Show regex, up to a maximum length. If it's too long, chop and add
373 * "...".
374 */
58e23c8d 375#define _FAIL(code) STMT_START { \
bfed75c6 376 const char *ellipses = ""; \
ccb2c380
MP
377 IV len = RExC_end - RExC_precomp; \
378 \
379 if (!SIZE_ONLY) \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 if (len > RegexLengthToShowInErrorMessages) { \
382 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
383 len = RegexLengthToShowInErrorMessages - 10; \
384 ellipses = "..."; \
385 } \
58e23c8d 386 code; \
ccb2c380 387} STMT_END
8615cb43 388
58e23c8d
YO
389#define FAIL(msg) _FAIL( \
390 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
391 msg, (int)len, RExC_precomp, ellipses))
392
393#define FAIL2(msg,arg) _FAIL( \
394 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
395 arg, (int)len, RExC_precomp, ellipses))
396
b45f050a 397/*
b45f050a
JF
398 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
399 */
ccb2c380 400#define Simple_vFAIL(m) STMT_START { \
a28509cc 401 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
402 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
403 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
b45f050a
JF
405
406/*
407 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
408 */
ccb2c380
MP
409#define vFAIL(m) STMT_START { \
410 if (!SIZE_ONLY) \
411 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
412 Simple_vFAIL(m); \
413} STMT_END
b45f050a
JF
414
415/*
416 * Like Simple_vFAIL(), but accepts two arguments.
417 */
ccb2c380 418#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 419 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
420 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
422} STMT_END
b45f050a
JF
423
424/*
425 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
426 */
ccb2c380
MP
427#define vFAIL2(m,a1) STMT_START { \
428 if (!SIZE_ONLY) \
429 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
430 Simple_vFAIL2(m, a1); \
431} STMT_END
b45f050a
JF
432
433
434/*
435 * Like Simple_vFAIL(), but accepts three arguments.
436 */
ccb2c380 437#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 438 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
439 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
440 (int)offset, RExC_precomp, RExC_precomp + offset); \
441} STMT_END
b45f050a
JF
442
443/*
444 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
445 */
ccb2c380
MP
446#define vFAIL3(m,a1,a2) STMT_START { \
447 if (!SIZE_ONLY) \
448 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
449 Simple_vFAIL3(m, a1, a2); \
450} STMT_END
b45f050a
JF
451
452/*
453 * Like Simple_vFAIL(), but accepts four arguments.
454 */
ccb2c380 455#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 456 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
457 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
458 (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
b45f050a 460
ccb2c380 461#define vWARN(loc,m) STMT_START { \
a28509cc 462 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
463 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
464 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
466
467#define vWARNdep(loc,m) STMT_START { \
a28509cc 468 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
469 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
470 "%s" REPORT_LOCATION, \
471 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474
475#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 476 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
477 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
478 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
479} STMT_END
480
481#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
486
487#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
491} STMT_END
492
493#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
9d1d55b5 498
8615cb43 499
cd439c50 500/* Allow for side effects in s */
ccb2c380
MP
501#define REGC(c,s) STMT_START { \
502 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
503} STMT_END
cd439c50 504
fac92740
MJD
505/* Macros for recording node offsets. 20001227 mjd@plover.com
506 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
507 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
508 * Element 0 holds the number n.
07be1b83 509 * Position is 1 indexed.
fac92740
MJD
510 */
511
ccb2c380
MP
512#define Set_Node_Offset_To_R(node,byte) STMT_START { \
513 if (! SIZE_ONLY) { \
514 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 515 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 516 if((node) < 0) { \
551405c4 517 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
518 } else { \
519 RExC_offsets[2*(node)-1] = (byte); \
520 } \
521 } \
522} STMT_END
523
524#define Set_Node_Offset(node,byte) \
525 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
526#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
527
528#define Set_Node_Length_To_R(node,len) STMT_START { \
529 if (! SIZE_ONLY) { \
530 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 531 __LINE__, (int)(node), (int)(len))); \
ccb2c380 532 if((node) < 0) { \
551405c4 533 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
534 } else { \
535 RExC_offsets[2*(node)] = (len); \
536 } \
537 } \
538} STMT_END
539
540#define Set_Node_Length(node,len) \
541 Set_Node_Length_To_R((node)-RExC_emit_start, len)
542#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
543#define Set_Node_Cur_Length(node) \
544 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
545
546/* Get offsets and lengths */
547#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
548#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
549
07be1b83
YO
550#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
551 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
552 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
553} STMT_END
554
555
556#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
557#define EXPERIMENTAL_INPLACESCAN
558#endif
559
1de06328 560#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 561DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 562 PerlIO_printf(Perl_debug_log, \
b515a41d 563 "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf \
1de06328
YO
564 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
565 (int)(depth)*2, "", \
566 (IV)((data)->pos_min), \
567 (IV)((data)->pos_delta), \
568 (IV)((data)->flags), \
569 (IV)((data)->whilem_c), \
570 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
571 ); \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
589 ); \
590 PerlIO_printf(Perl_debug_log,"\n"); \
591});
592
acfe0abc 593static void clear_re(pTHX_ void *r);
4327152a 594
653099ff 595/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 596 Update the longest found anchored substring and the longest found
653099ff
GS
597 floating substrings if needed. */
598
4327152a 599STATIC void
1de06328 600S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 601{
e1ec3a88
AL
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 604 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 605
c277df42 606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 607 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
b81d288d 611 data->flags
c277df42
IZ
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
613 else
614 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
a0ed51b3
LW
617 }
618 else {
c277df42 619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
620 data->offset_float_max = (l
621 ? data->last_start_max
c277df42 622 : data->pos_min + data->pos_delta);
9051bda5
HS
623 if ((U32)data->offset_float_max > (U32)I32_MAX)
624 data->offset_float_max = I32_MAX;
c277df42 625 if (data->flags & SF_BEFORE_EOL)
b81d288d 626 data->flags
c277df42
IZ
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
628 else
629 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
c277df42
IZ
632 }
633 }
634 SvCUR_set(data->last_found, 0);
0eda9292 635 {
a28509cc 636 SV * const sv = data->last_found;
097eb12c
AL
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
639 if (mg)
640 mg->mg_len = 0;
641 }
0eda9292 642 }
c277df42
IZ
643 data->last_end = -1;
644 data->flags &= ~SF_BEFORE_EOL;
1de06328 645 DEBUG_STUDYDATA(data,0);
c277df42
IZ
646}
647
653099ff
GS
648/* Can match anything (initialization) */
649STATIC void
097eb12c 650S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
653099ff 652 ANYOF_CLASS_ZERO(cl);
f8bef550 653 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* Can match anything (initialization) */
660STATIC int
5f66b61c 661S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
662{
663 int value;
664
aaa51d5e 665 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
667 return 1;
1aa99e6b
IH
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
669 return 0;
10edeb5d 670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 671 return 0;
653099ff
GS
672 return 1;
673}
674
675/* Can match anything (initialization) */
676STATIC void
097eb12c 677S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 678{
8ecf7187 679 Zero(cl, 1, struct regnode_charclass_class);
653099ff 680 cl->type = ANYOF;
830247a4 681 cl_anything(pRExC_state, cl);
653099ff
GS
682}
683
684STATIC void
097eb12c 685S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 686{
8ecf7187 687 Zero(cl, 1, struct regnode_charclass_class);
653099ff 688 cl->type = ANYOF;
830247a4 689 cl_anything(pRExC_state, cl);
653099ff
GS
690 if (LOC)
691 cl->flags |= ANYOF_LOCALE;
692}
693
694/* 'And' a given class with another one. Can create false positives */
695/* We assume that cl is not inverted */
696STATIC void
5f66b61c 697S_cl_and(struct regnode_charclass_class *cl,
a28509cc 698 const struct regnode_charclass_class *and_with)
653099ff 699{
40d049e4
YO
700
701 assert(and_with->type == ANYOF);
653099ff
GS
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
707 int i;
708
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
712 else
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
1aa99e6b 718
14ebb1a2
JH
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
724 }
14ebb1a2
JH
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 727 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
731}
732
733/* 'OR' a given class with another one. Can create false positives */
734/* We assume that cl is not inverted */
735STATIC void
097eb12c 736S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 737{
653099ff
GS
738 if (or_with->flags & ANYOF_INVERT) {
739 /* We do not use
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
747 */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
751 int i;
752
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
756 else {
830247a4 757 cl_anything(pRExC_state, cl);
653099ff
GS
758 }
759 } else {
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 762 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
763 || (cl->flags & ANYOF_FOLD)) ) {
764 int i;
765
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
773 }
774 }
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 776 cl_anything(pRExC_state, cl);
653099ff
GS
777 }
778 }
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
781
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
786 }
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
653099ff
GS
791}
792
a3621e74
YO
793#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
797
3dab1dad
YO
798
799#ifdef DEBUGGING
07be1b83 800/*
55eed653
NC
801 dump_trie(trie,widecharmap)
802 dump_trie_interim_list(trie,widecharmap,next_alloc)
803 dump_trie_interim_table(trie,widecharmap,next_alloc)
3dab1dad
YO
804
805 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
809
3dab1dad
YO
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
07be1b83
YO
812
813*/
3dab1dad
YO
814
815/*
3dab1dad
YO
816 Dumps the final compressed table form of the trie to Perl_debug_log.
817 Used for debugging make_trie().
818*/
819
820STATIC void
55eed653 821S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, U32 depth)
3dab1dad
YO
822{
823 U32 state;
ab3bbdeb 824 SV *sv=sv_newmortal();
55eed653 825 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
826 GET_RE_DEBUG_FLAGS_DECL;
827
ab3bbdeb 828
3dab1dad
YO
829 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
830 (int)depth * 2 + 2,"",
831 "Match","Base","Ofs" );
832
833 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 834 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 835 if ( tmp ) {
ab3bbdeb
YO
836 PerlIO_printf( Perl_debug_log, "%*s",
837 colwidth,
ddc5bc0f 838 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
839 PL_colors[0], PL_colors[1],
840 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
841 PERL_PV_ESCAPE_FIRSTCHAR
842 )
843 );
3dab1dad
YO
844 }
845 }
846 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
847 (int)depth * 2 + 2,"");
848
849 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 850 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
851 PerlIO_printf( Perl_debug_log, "\n");
852
1e2e3d02 853 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 854 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
855
856 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
857
858 if ( trie->states[ state ].wordnum ) {
859 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
860 } else {
861 PerlIO_printf( Perl_debug_log, "%6s", "" );
862 }
863
864 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
865
866 if ( base ) {
867 U32 ofs = 0;
868
869 while( ( base + ofs < trie->uniquecharcount ) ||
870 ( base + ofs - trie->uniquecharcount < trie->lasttrans
871 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
872 ofs++;
873
874 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
875
876 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
877 if ( ( base + ofs >= trie->uniquecharcount ) &&
878 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
879 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
880 {
ab3bbdeb
YO
881 PerlIO_printf( Perl_debug_log, "%*"UVXf,
882 colwidth,
3dab1dad
YO
883 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
884 } else {
ab3bbdeb 885 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
886 }
887 }
888
889 PerlIO_printf( Perl_debug_log, "]");
890
891 }
892 PerlIO_printf( Perl_debug_log, "\n" );
893 }
894}
895/*
3dab1dad
YO
896 Dumps a fully constructed but uncompressed trie in list form.
897 List tries normally only are used for construction when the number of
898 possible chars (trie->uniquecharcount) is very high.
899 Used for debugging make_trie().
900*/
901STATIC void
55eed653
NC
902S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
903 HV *widecharmap, U32 next_alloc, U32 depth)
3dab1dad
YO
904{
905 U32 state;
ab3bbdeb 906 SV *sv=sv_newmortal();
55eed653 907 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
908 GET_RE_DEBUG_FLAGS_DECL;
909 /* print out the table precompression. */
ab3bbdeb
YO
910 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
911 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
912 "------:-----+-----------------\n" );
3dab1dad
YO
913
914 for( state=1 ; state < next_alloc ; state ++ ) {
915 U16 charid;
916
ab3bbdeb 917 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
918 (int)depth * 2 + 2,"", (UV)state );
919 if ( ! trie->states[ state ].wordnum ) {
920 PerlIO_printf( Perl_debug_log, "%5s| ","");
921 } else {
922 PerlIO_printf( Perl_debug_log, "W%4x| ",
923 trie->states[ state ].wordnum
924 );
925 }
926 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 927 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
928 if ( tmp ) {
929 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
930 colwidth,
ddc5bc0f 931 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
932 PL_colors[0], PL_colors[1],
933 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
934 PERL_PV_ESCAPE_FIRSTCHAR
935 ) ,
1e2e3d02
YO
936 TRIE_LIST_ITEM(state,charid).forid,
937 (UV)TRIE_LIST_ITEM(state,charid).newstate
938 );
939 if (!(charid % 10))
664e119d
RGS
940 PerlIO_printf(Perl_debug_log, "\n%*s| ",
941 (int)((depth * 2) + 14), "");
1e2e3d02 942 }
ab3bbdeb
YO
943 }
944 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
945 }
946}
947
948/*
3dab1dad
YO
949 Dumps a fully constructed but uncompressed trie in table form.
950 This is the normal DFA style state transition table, with a few
951 twists to facilitate compression later.
952 Used for debugging make_trie().
953*/
954STATIC void
55eed653
NC
955S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
956 HV *widecharmap, U32 next_alloc, U32 depth)
3dab1dad
YO
957{
958 U32 state;
959 U16 charid;
ab3bbdeb 960 SV *sv=sv_newmortal();
55eed653 961 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
962 GET_RE_DEBUG_FLAGS_DECL;
963
964 /*
965 print out the table precompression so that we can do a visual check
966 that they are identical.
967 */
968
969 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
970
971 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 972 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 973 if ( tmp ) {
ab3bbdeb
YO
974 PerlIO_printf( Perl_debug_log, "%*s",
975 colwidth,
ddc5bc0f 976 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
977 PL_colors[0], PL_colors[1],
978 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
979 PERL_PV_ESCAPE_FIRSTCHAR
980 )
981 );
3dab1dad
YO
982 }
983 }
984
985 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
986
987 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 988 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
989 }
990
991 PerlIO_printf( Perl_debug_log, "\n" );
992
993 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
994
995 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
996 (int)depth * 2 + 2,"",
997 (UV)TRIE_NODENUM( state ) );
998
999 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1000 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1001 if (v)
1002 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1003 else
1004 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1005 }
1006 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1007 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1008 } else {
1009 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1010 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1011 }
1012 }
07be1b83 1013}
3dab1dad
YO
1014
1015#endif
1016
786e8c11
YO
1017/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1018 startbranch: the first branch in the whole branch sequence
1019 first : start branch of sequence of branch-exact nodes.
1020 May be the same as startbranch
1021 last : Thing following the last branch.
1022 May be the same as tail.
1023 tail : item following the branch sequence
1024 count : words in the sequence
1025 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1026 depth : indent depth
3dab1dad 1027
786e8c11 1028Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1029
786e8c11
YO
1030A trie is an N'ary tree where the branches are determined by digital
1031decomposition of the key. IE, at the root node you look up the 1st character and
1032follow that branch repeat until you find the end of the branches. Nodes can be
1033marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1034
786e8c11 1035 /he|she|his|hers/
72f13be8 1036
786e8c11
YO
1037would convert into the following structure. Numbers represent states, letters
1038following numbers represent valid transitions on the letter from that state, if
1039the number is in square brackets it represents an accepting state, otherwise it
1040will be in parenthesis.
07be1b83 1041
786e8c11
YO
1042 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1043 | |
1044 | (2)
1045 | |
1046 (1) +-i->(6)-+-s->[7]
1047 |
1048 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1049
786e8c11
YO
1050 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1051
1052This shows that when matching against the string 'hers' we will begin at state 1
1053read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1054then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1055is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1056single traverse. We store a mapping from accepting to state to which word was
1057matched, and then when we have multiple possibilities we try to complete the
1058rest of the regex in the order in which they occured in the alternation.
1059
1060The only prior NFA like behaviour that would be changed by the TRIE support is
1061the silent ignoring of duplicate alternations which are of the form:
1062
1063 / (DUPE|DUPE) X? (?{ ... }) Y /x
1064
1065Thus EVAL blocks follwing a trie may be called a different number of times with
1066and without the optimisation. With the optimisations dupes will be silently
1067ignored. This inconsistant behaviour of EVAL type nodes is well established as
1068the following demonstrates:
1069
1070 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1071
1072which prints out 'word' three times, but
1073
1074 'words'=~/(word|word|word)(?{ print $1 })S/
1075
1076which doesnt print it out at all. This is due to other optimisations kicking in.
1077
1078Example of what happens on a structural level:
1079
1080The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1081
1082 1: CURLYM[1] {1,32767}(18)
1083 5: BRANCH(8)
1084 6: EXACT <ac>(16)
1085 8: BRANCH(11)
1086 9: EXACT <ad>(16)
1087 11: BRANCH(14)
1088 12: EXACT <ab>(16)
1089 16: SUCCEED(0)
1090 17: NOTHING(18)
1091 18: END(0)
1092
1093This would be optimizable with startbranch=5, first=5, last=16, tail=16
1094and should turn into:
1095
1096 1: CURLYM[1] {1,32767}(18)
1097 5: TRIE(16)
1098 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1099 <ac>
1100 <ad>
1101 <ab>
1102 16: SUCCEED(0)
1103 17: NOTHING(18)
1104 18: END(0)
1105
1106Cases where tail != last would be like /(?foo|bar)baz/:
1107
1108 1: BRANCH(4)
1109 2: EXACT <foo>(8)
1110 4: BRANCH(7)
1111 5: EXACT <bar>(8)
1112 7: TAIL(8)
1113 8: EXACT <baz>(10)
1114 10: END(0)
1115
1116which would be optimizable with startbranch=1, first=1, last=7, tail=8
1117and would end up looking like:
1118
1119 1: TRIE(8)
1120 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1121 <foo>
1122 <bar>
1123 7: TAIL(8)
1124 8: EXACT <baz>(10)
1125 10: END(0)
1126
1127 d = uvuni_to_utf8_flags(d, uv, 0);
1128
1129is the recommended Unicode-aware way of saying
1130
1131 *(d++) = uv;
1132*/
1133
1e2e3d02 1134#define TRIE_STORE_REVCHAR \
786e8c11 1135 STMT_START { \
1e2e3d02 1136 SV *tmp = newSVpvs(""); \
786e8c11 1137 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1138 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1139 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1140 } STMT_END
1141
1142#define TRIE_READ_CHAR STMT_START { \
1143 wordlen++; \
1144 if ( UTF ) { \
1145 if ( folder ) { \
1146 if ( foldlen > 0 ) { \
1147 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1148 foldlen -= len; \
1149 scan += len; \
1150 len = 0; \
1151 } else { \
1152 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1153 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1154 foldlen -= UNISKIP( uvc ); \
1155 scan = foldbuf + UNISKIP( uvc ); \
1156 } \
1157 } else { \
1158 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1159 } \
1160 } else { \
1161 uvc = (U32)*uc; \
1162 len = 1; \
1163 } \
1164} STMT_END
1165
1166
1167
1168#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1169 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1170 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1171 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1172 } \
1173 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1174 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1175 TRIE_LIST_CUR( state )++; \
1176} STMT_END
07be1b83 1177
786e8c11
YO
1178#define TRIE_LIST_NEW(state) STMT_START { \
1179 Newxz( trie->states[ state ].trans.list, \
1180 4, reg_trie_trans_le ); \
1181 TRIE_LIST_CUR( state ) = 1; \
1182 TRIE_LIST_LEN( state ) = 4; \
1183} STMT_END
07be1b83 1184
786e8c11
YO
1185#define TRIE_HANDLE_WORD(state) STMT_START { \
1186 U16 dupe= trie->states[ state ].wordnum; \
1187 regnode * const noper_next = regnext( noper ); \
1188 \
1189 if (trie->wordlen) \
1190 trie->wordlen[ curword ] = wordlen; \
1191 DEBUG_r({ \
1192 /* store the word for dumping */ \
1193 SV* tmp; \
1194 if (OP(noper) != NOTHING) \
1195 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1196 else \
1197 tmp = newSVpvn( "", 0 ); \
1198 if ( UTF ) SvUTF8_on( tmp ); \
1199 av_push( trie->words, tmp ); \
1200 }); \
1201 \
1202 curword++; \
1203 \
1204 if ( noper_next < tail ) { \
1205 if (!trie->jump) \
446bd890 1206 trie->jump = PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1207 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1208 if (!jumper) \
1209 jumper = noper_next; \
1210 if (!nextbranch) \
1211 nextbranch= regnext(cur); \
1212 } \
1213 \
1214 if ( dupe ) { \
1215 /* So it's a dupe. This means we need to maintain a */\
1216 /* linked-list from the first to the next. */\
1217 /* we only allocate the nextword buffer when there */\
1218 /* a dupe, so first time we have to do the allocation */\
1219 if (!trie->nextword) \
446bd890
NC
1220 trie->nextword = \
1221 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1222 while ( trie->nextword[dupe] ) \
1223 dupe= trie->nextword[dupe]; \
1224 trie->nextword[dupe]= curword; \
1225 } else { \
1226 /* we haven't inserted this word yet. */ \
1227 trie->states[ state ].wordnum = curword; \
1228 } \
1229} STMT_END
07be1b83 1230
3dab1dad 1231
786e8c11
YO
1232#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1233 ( ( base + charid >= ucharcount \
1234 && base + charid < ubound \
1235 && state == trie->trans[ base - ucharcount + charid ].check \
1236 && trie->trans[ base - ucharcount + charid ].next ) \
1237 ? trie->trans[ base - ucharcount + charid ].next \
1238 : ( state==1 ? special : 0 ) \
1239 )
3dab1dad 1240
786e8c11
YO
1241#define MADE_TRIE 1
1242#define MADE_JUMP_TRIE 2
1243#define MADE_EXACT_TRIE 4
3dab1dad 1244
a3621e74 1245STATIC I32
786e8c11 1246S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1247{
27da23d5 1248 dVAR;
a3621e74
YO
1249 /* first pass, loop through and scan words */
1250 reg_trie_data *trie;
55eed653 1251 HV *widecharmap = NULL;
a3621e74 1252 regnode *cur;
9f7f3913 1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1254 STRLEN len = 0;
1255 UV uvc = 0;
1256 U16 curword = 0;
1257 U32 next_alloc = 0;
786e8c11
YO
1258 regnode *jumper = NULL;
1259 regnode *nextbranch = NULL;
7f69552c 1260 regnode *convert = NULL;
a3621e74 1261 /* we just use folder as a flag in utf8 */
e1ec3a88 1262 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1263 ? PL_fold
1264 : ( flags == EXACTFL
1265 ? PL_fold_locale
1266 : NULL
1267 )
1268 );
1269
55eed653 1270 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
a3621e74 1271 SV *re_trie_maxbuff;
3dab1dad
YO
1272#ifndef DEBUGGING
1273 /* these are only used during construction but are useful during
8e11feef 1274 * debugging so we store them in the struct when debugging.
8e11feef 1275 */
3dab1dad 1276 STRLEN trie_charcount=0;
3dab1dad
YO
1277 AV *trie_revcharmap;
1278#endif
a3621e74 1279 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1280#ifndef DEBUGGING
1281 PERL_UNUSED_ARG(depth);
1282#endif
a3621e74 1283
446bd890 1284 trie = PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1285 trie->refcount = 1;
3dab1dad 1286 trie->startstate = 1;
786e8c11 1287 trie->wordcount = word_count;
f8fc2ecf 1288 RExC_rxi->data->data[ data_slot ] = (void*)trie;
446bd890 1289 trie->charmap = PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1290 if (!(UTF && folder))
446bd890 1291 trie->bitmap = PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74
YO
1292 DEBUG_r({
1293 trie->words = newAV();
a3621e74 1294 });
3dab1dad 1295 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1296
0111c4fd 1297 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1298 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1299 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1300 }
3dab1dad
YO
1301 DEBUG_OPTIMISE_r({
1302 PerlIO_printf( Perl_debug_log,
786e8c11 1303 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1304 (int)depth * 2 + 2, "",
1305 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1306 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1307 (int)depth);
3dab1dad 1308 });
7f69552c
YO
1309
1310 /* Find the node we are going to overwrite */
1311 if ( first == startbranch && OP( last ) != BRANCH ) {
1312 /* whole branch chain */
1313 convert = first;
1314 } else {
1315 /* branch sub-chain */
1316 convert = NEXTOPER( first );
1317 }
1318
a3621e74
YO
1319 /* -- First loop and Setup --
1320
1321 We first traverse the branches and scan each word to determine if it
1322 contains widechars, and how many unique chars there are, this is
1323 important as we have to build a table with at least as many columns as we
1324 have unique chars.
1325
1326 We use an array of integers to represent the character codes 0..255
1327 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1328 native representation of the character value as the key and IV's for the
1329 coded index.
1330
1331 *TODO* If we keep track of how many times each character is used we can
1332 remap the columns so that the table compression later on is more
1333 efficient in terms of memory by ensuring most common value is in the
1334 middle and the least common are on the outside. IMO this would be better
1335 than a most to least common mapping as theres a decent chance the most
1336 common letter will share a node with the least common, meaning the node
1337 will not be compressable. With a middle is most common approach the worst
1338 case is when we have the least common nodes twice.
1339
1340 */
1341
a3621e74 1342 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1343 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1344 const U8 *uc = (U8*)STRING( noper );
a28509cc 1345 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1346 STRLEN foldlen = 0;
1347 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1348 const U8 *scan = (U8*)NULL;
07be1b83 1349 U32 wordlen = 0; /* required init */
3dab1dad 1350 STRLEN chars=0;
a3621e74 1351
3dab1dad
YO
1352 if (OP(noper) == NOTHING) {
1353 trie->minlen= 0;
1354 continue;
1355 }
1356 if (trie->bitmap) {
1357 TRIE_BITMAP_SET(trie,*uc);
1358 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1359 }
a3621e74 1360 for ( ; uc < e ; uc += len ) {
3dab1dad 1361 TRIE_CHARCOUNT(trie)++;
a3621e74 1362 TRIE_READ_CHAR;
3dab1dad 1363 chars++;
a3621e74
YO
1364 if ( uvc < 256 ) {
1365 if ( !trie->charmap[ uvc ] ) {
1366 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1367 if ( folder )
1368 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1369 TRIE_STORE_REVCHAR;
a3621e74
YO
1370 }
1371 } else {
1372 SV** svpp;
55eed653
NC
1373 if ( !widecharmap )
1374 widecharmap = newHV();
a3621e74 1375
55eed653 1376 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1377
1378 if ( !svpp )
e4584336 1379 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1380
1381 if ( !SvTRUE( *svpp ) ) {
1382 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1383 TRIE_STORE_REVCHAR;
a3621e74
YO
1384 }
1385 }
1386 }
3dab1dad
YO
1387 if( cur == first ) {
1388 trie->minlen=chars;
1389 trie->maxlen=chars;
1390 } else if (chars < trie->minlen) {
1391 trie->minlen=chars;
1392 } else if (chars > trie->maxlen) {
1393 trie->maxlen=chars;
1394 }
1395
a3621e74
YO
1396 } /* end first pass */
1397 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1398 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1399 (int)depth * 2 + 2,"",
55eed653 1400 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1401 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1402 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1403 );
446bd890 1404 trie->wordlen = PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1405
1406 /*
1407 We now know what we are dealing with in terms of unique chars and
1408 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1409 representation using a flat table will take. If it's over a reasonable
1410 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1411 conservative but potentially much slower representation using an array
1412 of lists.
1413
1414 At the end we convert both representations into the same compressed
1415 form that will be used in regexec.c for matching with. The latter
1416 is a form that cannot be used to construct with but has memory
1417 properties similar to the list form and access properties similar
1418 to the table form making it both suitable for fast searches and
1419 small enough that its feasable to store for the duration of a program.
1420
1421 See the comment in the code where the compressed table is produced
1422 inplace from the flat tabe representation for an explanation of how
1423 the compression works.
1424
1425 */
1426
1427
3dab1dad 1428 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1429 /*
1430 Second Pass -- Array Of Lists Representation
1431
1432 Each state will be represented by a list of charid:state records
1433 (reg_trie_trans_le) the first such element holds the CUR and LEN
1434 points of the allocated array. (See defines above).
1435
1436 We build the initial structure using the lists, and then convert
1437 it into the compressed table form which allows faster lookups
1438 (but cant be modified once converted).
a3621e74
YO
1439 */
1440
a3621e74
YO
1441 STRLEN transcount = 1;
1442
1e2e3d02
YO
1443 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1444 "%*sCompiling trie using list compiler\n",
1445 (int)depth * 2 + 2, ""));
446bd890
NC
1446
1447 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1448 sizeof(reg_trie_state) );
a3621e74
YO
1449 TRIE_LIST_NEW(1);
1450 next_alloc = 2;
1451
1452 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1453
c445ea15
AL
1454 regnode * const noper = NEXTOPER( cur );
1455 U8 *uc = (U8*)STRING( noper );
1456 const U8 * const e = uc + STR_LEN( noper );
1457 U32 state = 1; /* required init */
1458 U16 charid = 0; /* sanity init */
1459 U8 *scan = (U8*)NULL; /* sanity init */
1460 STRLEN foldlen = 0; /* required init */
07be1b83 1461 U32 wordlen = 0; /* required init */
c445ea15
AL
1462 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1463
3dab1dad 1464 if (OP(noper) != NOTHING) {
786e8c11 1465 for ( ; uc < e ; uc += len ) {
c445ea15 1466
786e8c11 1467 TRIE_READ_CHAR;
c445ea15 1468
786e8c11
YO
1469 if ( uvc < 256 ) {
1470 charid = trie->charmap[ uvc ];
c445ea15 1471 } else {
55eed653 1472 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1473 if ( !svpp ) {
1474 charid = 0;
1475 } else {
1476 charid=(U16)SvIV( *svpp );
1477 }
c445ea15 1478 }
786e8c11
YO
1479 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1480 if ( charid ) {
a3621e74 1481
786e8c11
YO
1482 U16 check;
1483 U32 newstate = 0;
a3621e74 1484
786e8c11
YO
1485 charid--;
1486 if ( !trie->states[ state ].trans.list ) {
1487 TRIE_LIST_NEW( state );
c445ea15 1488 }
786e8c11
YO
1489 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1490 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1491 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1492 break;
1493 }
1494 }
1495 if ( ! newstate ) {
1496 newstate = next_alloc++;
1497 TRIE_LIST_PUSH( state, charid, newstate );
1498 transcount++;
1499 }
1500 state = newstate;
1501 } else {
1502 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1503 }
a28509cc 1504 }
c445ea15 1505 }
3dab1dad 1506 TRIE_HANDLE_WORD(state);
a3621e74
YO
1507
1508 } /* end second pass */
1509
1e2e3d02
YO
1510 /* next alloc is the NEXT state to be allocated */
1511 trie->statecount = next_alloc;
446bd890
NC
1512 trie->states = PerlMemShared_realloc( trie->states, next_alloc
1513 * sizeof(reg_trie_state) );
a3621e74 1514
3dab1dad
YO
1515 /* and now dump it out before we compress it */
1516 DEBUG_TRIE_COMPILE_MORE_r(
55eed653 1517 dump_trie_interim_list(trie,widecharmap,next_alloc,depth+1)
1e2e3d02 1518 );
a3621e74 1519
446bd890
NC
1520 trie->trans
1521 = PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1522 {
1523 U32 state;
a3621e74
YO
1524 U32 tp = 0;
1525 U32 zp = 0;
1526
1527
1528 for( state=1 ; state < next_alloc ; state ++ ) {
1529 U32 base=0;
1530
1531 /*
1532 DEBUG_TRIE_COMPILE_MORE_r(
1533 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1534 );
1535 */
1536
1537 if (trie->states[state].trans.list) {
1538 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1539 U16 maxid=minid;
a28509cc 1540 U16 idx;
a3621e74
YO
1541
1542 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1543 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1544 if ( forid < minid ) {
1545 minid=forid;
1546 } else if ( forid > maxid ) {
1547 maxid=forid;
1548 }
a3621e74
YO
1549 }
1550 if ( transcount < tp + maxid - minid + 1) {
1551 transcount *= 2;
446bd890
NC
1552 trie->trans
1553 = PerlMemShared_realloc( trie->trans,
1554 transcount
1555 * sizeof(reg_trie_trans) );
a3621e74
YO
1556 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1557 }
1558 base = trie->uniquecharcount + tp - minid;
1559 if ( maxid == minid ) {
1560 U32 set = 0;
1561 for ( ; zp < tp ; zp++ ) {
1562 if ( ! trie->trans[ zp ].next ) {
1563 base = trie->uniquecharcount + zp - minid;
1564 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1565 trie->trans[ zp ].check = state;
1566 set = 1;
1567 break;
1568 }
1569 }
1570 if ( !set ) {
1571 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1572 trie->trans[ tp ].check = state;
1573 tp++;
1574 zp = tp;
1575 }
1576 } else {
1577 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1578 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1579 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1580 trie->trans[ tid ].check = state;
1581 }
1582 tp += ( maxid - minid + 1 );
1583 }
1584 Safefree(trie->states[ state ].trans.list);
1585 }
1586 /*
1587 DEBUG_TRIE_COMPILE_MORE_r(
1588 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1589 );
1590 */
1591 trie->states[ state ].trans.base=base;
1592 }
cc601c31 1593 trie->lasttrans = tp + 1;
a3621e74
YO
1594 }
1595 } else {
1596 /*
1597 Second Pass -- Flat Table Representation.
1598
1599 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1600 We know that we will need Charcount+1 trans at most to store the data
1601 (one row per char at worst case) So we preallocate both structures
1602 assuming worst case.
1603
1604 We then construct the trie using only the .next slots of the entry
1605 structs.
1606
1607 We use the .check field of the first entry of the node temporarily to
1608 make compression both faster and easier by keeping track of how many non
1609 zero fields are in the node.
1610
1611 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1612 transition.
1613
1614 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1615 number representing the first entry of the node, and state as a
1616 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1617 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1618 are 2 entrys per node. eg:
1619
1620 A B A B
1621 1. 2 4 1. 3 7
1622 2. 0 3 3. 0 5
1623 3. 0 0 5. 0 0
1624 4. 0 0 7. 0 0
1625
1626 The table is internally in the right hand, idx form. However as we also
1627 have to deal with the states array which is indexed by nodenum we have to
1628 use TRIE_NODENUM() to convert.
1629
1630 */
1e2e3d02
YO
1631 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1632 "%*sCompiling trie using table compiler\n",
1633 (int)depth * 2 + 2, ""));
3dab1dad 1634
446bd890
NC
1635 trie->trans = PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1636 * trie->uniquecharcount + 1,
1637 sizeof(reg_trie_trans) );
1638 trie->states = PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1639 sizeof(reg_trie_state) );
a3621e74
YO
1640 next_alloc = trie->uniquecharcount + 1;
1641
3dab1dad 1642
a3621e74
YO
1643 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1644
c445ea15 1645 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1646 const U8 *uc = (U8*)STRING( noper );
1647 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1648
1649 U32 state = 1; /* required init */
1650
1651 U16 charid = 0; /* sanity init */
1652 U32 accept_state = 0; /* sanity init */
1653 U8 *scan = (U8*)NULL; /* sanity init */
1654
1655 STRLEN foldlen = 0; /* required init */
07be1b83 1656 U32 wordlen = 0; /* required init */
a3621e74
YO
1657 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1658
3dab1dad 1659 if ( OP(noper) != NOTHING ) {
786e8c11 1660 for ( ; uc < e ; uc += len ) {
a3621e74 1661
786e8c11 1662 TRIE_READ_CHAR;
a3621e74 1663
786e8c11
YO
1664 if ( uvc < 256 ) {
1665 charid = trie->charmap[ uvc ];
1666 } else {
55eed653 1667 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1668 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1669 }
786e8c11
YO
1670 if ( charid ) {
1671 charid--;
1672 if ( !trie->trans[ state + charid ].next ) {
1673 trie->trans[ state + charid ].next = next_alloc;
1674 trie->trans[ state ].check++;
1675 next_alloc += trie->uniquecharcount;
1676 }
1677 state = trie->trans[ state + charid ].next;
1678 } else {
1679 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1680 }
1681 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1682 }
a3621e74 1683 }
3dab1dad
YO
1684 accept_state = TRIE_NODENUM( state );
1685 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1686
1687 } /* end second pass */
1688
3dab1dad
YO
1689 /* and now dump it out before we compress it */
1690 DEBUG_TRIE_COMPILE_MORE_r(
55eed653 1691 dump_trie_interim_table(trie,widecharmap,next_alloc,depth+1)
3dab1dad 1692 );
a3621e74 1693
a3621e74
YO
1694 {
1695 /*
1696 * Inplace compress the table.*
1697
1698 For sparse data sets the table constructed by the trie algorithm will
1699 be mostly 0/FAIL transitions or to put it another way mostly empty.
1700 (Note that leaf nodes will not contain any transitions.)
1701
1702 This algorithm compresses the tables by eliminating most such
1703 transitions, at the cost of a modest bit of extra work during lookup:
1704
1705 - Each states[] entry contains a .base field which indicates the
1706 index in the state[] array wheres its transition data is stored.
1707
1708 - If .base is 0 there are no valid transitions from that node.
1709
1710 - If .base is nonzero then charid is added to it to find an entry in
1711 the trans array.
1712
1713 -If trans[states[state].base+charid].check!=state then the
1714 transition is taken to be a 0/Fail transition. Thus if there are fail
1715 transitions at the front of the node then the .base offset will point
1716 somewhere inside the previous nodes data (or maybe even into a node
1717 even earlier), but the .check field determines if the transition is
1718 valid.
1719
786e8c11 1720 XXX - wrong maybe?
a3621e74
YO
1721 The following process inplace converts the table to the compressed
1722 table: We first do not compress the root node 1,and mark its all its
1723 .check pointers as 1 and set its .base pointer as 1 as well. This
1724 allows to do a DFA construction from the compressed table later, and
1725 ensures that any .base pointers we calculate later are greater than
1726 0.
1727
1728 - We set 'pos' to indicate the first entry of the second node.
1729
1730 - We then iterate over the columns of the node, finding the first and
1731 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1732 and set the .check pointers accordingly, and advance pos
1733 appropriately and repreat for the next node. Note that when we copy
1734 the next pointers we have to convert them from the original
1735 NODEIDX form to NODENUM form as the former is not valid post
1736 compression.
1737
1738 - If a node has no transitions used we mark its base as 0 and do not
1739 advance the pos pointer.
1740
1741 - If a node only has one transition we use a second pointer into the
1742 structure to fill in allocated fail transitions from other states.
1743 This pointer is independent of the main pointer and scans forward
1744 looking for null transitions that are allocated to a state. When it
1745 finds one it writes the single transition into the "hole". If the
786e8c11 1746 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1747
1748 - Once compressed we can Renew/realloc the structures to release the
1749 excess space.
1750
1751 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1752 specifically Fig 3.47 and the associated pseudocode.
1753
1754 demq
1755 */
a3b680e6 1756 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1757 U32 state, charid;
a3621e74 1758 U32 pos = 0, zp=0;
1e2e3d02 1759 trie->statecount = laststate;
a3621e74
YO
1760
1761 for ( state = 1 ; state < laststate ; state++ ) {
1762 U8 flag = 0;
a28509cc
AL
1763 const U32 stateidx = TRIE_NODEIDX( state );
1764 const U32 o_used = trie->trans[ stateidx ].check;
1765 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1766 trie->trans[ stateidx ].check = 0;
1767
1768 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1769 if ( flag || trie->trans[ stateidx + charid ].next ) {
1770 if ( trie->trans[ stateidx + charid ].next ) {
1771 if (o_used == 1) {
1772 for ( ; zp < pos ; zp++ ) {
1773 if ( ! trie->trans[ zp ].next ) {
1774 break;
1775 }
1776 }
1777 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1778 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1779 trie->trans[ zp ].check = state;
1780 if ( ++zp > pos ) pos = zp;
1781 break;
1782 }
1783 used--;
1784 }
1785 if ( !flag ) {
1786 flag = 1;
1787 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1788 }
1789 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1790 trie->trans[ pos ].check = state;
1791 pos++;
1792 }
1793 }
1794 }
cc601c31 1795 trie->lasttrans = pos + 1;
446bd890
NC
1796 trie->states = PerlMemShared_realloc( trie->states, laststate
1797 * sizeof(reg_trie_state) );
a3621e74 1798 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1799 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1800 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1801 (int)depth * 2 + 2,"",
1802 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1803 (IV)next_alloc,
1804 (IV)pos,
a3621e74
YO
1805 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1806 );
1807
1808 } /* end table compress */
1809 }
1e2e3d02
YO
1810 DEBUG_TRIE_COMPILE_MORE_r(
1811 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1812 (int)depth * 2 + 2, "",
1813 (UV)trie->statecount,
1814 (UV)trie->lasttrans)
1815 );
cc601c31 1816 /* resize the trans array to remove unused space */
446bd890
NC
1817 trie->trans = PerlMemShared_realloc( trie->trans, trie->lasttrans
1818 * sizeof(reg_trie_trans) );
a3621e74 1819
3dab1dad
YO
1820 /* and now dump out the compressed format */
1821 DEBUG_TRIE_COMPILE_r(
55eed653 1822 dump_trie(trie,widecharmap,depth+1)
3dab1dad 1823 );
07be1b83 1824
3dab1dad 1825 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1826 U8 nodetype =(U8)(flags & 0xFF);
1827 char *str=NULL;
786e8c11 1828
07be1b83 1829#ifdef DEBUGGING
e62cc96a 1830 regnode *optimize = NULL;
b57a0404
JH
1831 U32 mjd_offset = 0;
1832 U32 mjd_nodelen = 0;
07be1b83 1833#endif
a3621e74 1834 /*
3dab1dad
YO
1835 This means we convert either the first branch or the first Exact,
1836 depending on whether the thing following (in 'last') is a branch
1837 or not and whther first is the startbranch (ie is it a sub part of
1838 the alternation or is it the whole thing.)
1839 Assuming its a sub part we conver the EXACT otherwise we convert
1840 the whole branch sequence, including the first.
a3621e74 1841 */
3dab1dad 1842 /* Find the node we are going to overwrite */
7f69552c 1843 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1844 /* branch sub-chain */
3dab1dad 1845 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1846 DEBUG_r({
1847 mjd_offset= Node_Offset((convert));
1848 mjd_nodelen= Node_Length((convert));
1849 });
7f69552c
YO
1850 /* whole branch chain */
1851 } else {
1852 DEBUG_r({
1853 const regnode *nop = NEXTOPER( convert );
1854 mjd_offset= Node_Offset((nop));
1855 mjd_nodelen= Node_Length((nop));
1856 });
07be1b83 1857 }
7f69552c 1858
07be1b83
YO
1859 DEBUG_OPTIMISE_r(
1860 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1861 (int)depth * 2 + 2, "",
786e8c11 1862 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1863 );
a3621e74 1864
3dab1dad
YO
1865 /* But first we check to see if there is a common prefix we can
1866 split out as an EXACT and put in front of the TRIE node. */
1867 trie->startstate= 1;
55eed653 1868 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1869 U32 state;
1e2e3d02 1870 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1871 U32 ofs = 0;
8e11feef
RGS
1872 I32 idx = -1;
1873 U32 count = 0;
1874 const U32 base = trie->states[ state ].trans.base;
a3621e74 1875
3dab1dad 1876 if ( trie->states[state].wordnum )
8e11feef 1877 count = 1;
a3621e74 1878
8e11feef 1879 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1880 if ( ( base + ofs >= trie->uniquecharcount ) &&
1881 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1882 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1883 {
3dab1dad 1884 if ( ++count > 1 ) {
8e11feef 1885 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1886 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1887 if ( state == 1 ) break;
3dab1dad
YO
1888 if ( count == 2 ) {
1889 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1890 DEBUG_OPTIMISE_r(
8e11feef
RGS
1891 PerlIO_printf(Perl_debug_log,
1892 "%*sNew Start State=%"UVuf" Class: [",
1893 (int)depth * 2 + 2, "",
786e8c11 1894 (UV)state));
be8e71aa
YO
1895 if (idx >= 0) {
1896 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1897 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1898
3dab1dad 1899 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1900 if ( folder )
1901 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1902 DEBUG_OPTIMISE_r(
07be1b83 1903 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1904 );
8e11feef
RGS
1905 }
1906 }
1907 TRIE_BITMAP_SET(trie,*ch);
1908 if ( folder )
1909 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1910 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1911 }
1912 idx = ofs;
1913 }
3dab1dad
YO
1914 }
1915 if ( count == 1 ) {
1916 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
de734bd5
A
1917 char *ch = SvPV_nolen( *tmp );
1918 DEBUG_OPTIMISE_r({
1919 SV *sv=sv_newmortal();
8e11feef
RGS
1920 PerlIO_printf( Perl_debug_log,
1921 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1922 (int)depth * 2 + 2, "",
de734bd5
A
1923 (UV)state, (UV)idx,
1924 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1925 PL_colors[0], PL_colors[1],
1926 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1927 PERL_PV_ESCAPE_FIRSTCHAR
1928 )
1929 );
1930 });
3dab1dad
YO
1931 if ( state==1 ) {
1932 OP( convert ) = nodetype;
1933 str=STRING(convert);
1934 STR_LEN(convert)=0;
1935 }
de734bd5
A
1936 while (*ch) {
1937 *str++ = *ch++;
1938 STR_LEN(convert)++;
1939 }
1940
8e11feef 1941 } else {
f9049ba1 1942#ifdef DEBUGGING
8e11feef
RGS
1943 if (state>1)
1944 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1945#endif
8e11feef
RGS
1946 break;
1947 }
1948 }
3dab1dad 1949 if (str) {
8e11feef 1950 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1951 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1952 trie->startstate = state;
07be1b83
YO
1953 trie->minlen -= (state - 1);
1954 trie->maxlen -= (state - 1);
1955 DEBUG_r({
1956 regnode *fix = convert;
de734bd5 1957 U32 word = trie->wordcount;
07be1b83
YO
1958 mjd_nodelen++;
1959 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1960 while( ++fix < n ) {
1961 Set_Node_Offset_Length(fix, 0, 0);
1962 }
de734bd5
A
1963 while (word--) {
1964 SV ** const tmp = av_fetch( trie->words, word, 0 );
1965 if (tmp) {
1966 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1967 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1968 else
1969 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1970 }
1971 }
07be1b83 1972 });
8e11feef
RGS
1973 if (trie->maxlen) {
1974 convert = n;
1975 } else {
3dab1dad 1976 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1977 DEBUG_r(optimize= n);
3dab1dad
YO
1978 }
1979 }
1980 }
a5ca303d
YO
1981 if (!jumper)
1982 jumper = last;
3dab1dad 1983 if ( trie->maxlen ) {
8e11feef
RGS
1984 NEXT_OFF( convert ) = (U16)(tail - convert);
1985 ARG_SET( convert, data_slot );
786e8c11
YO
1986 /* Store the offset to the first unabsorbed branch in
1987 jump[0], which is otherwise unused by the jump logic.
1988 We use this when dumping a trie and during optimisation. */
1989 if (trie->jump)
7f69552c 1990 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1991
786e8c11
YO
1992 /* XXXX */
1993 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1994 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1995 {
1996 OP( convert ) = TRIEC;
1997 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 1998 PerlMemShared_free(trie->bitmap);
786e8c11
YO
1999 trie->bitmap= NULL;
2000 } else
2001 OP( convert ) = TRIE;
a3621e74 2002
3dab1dad
YO
2003 /* store the type in the flags */
2004 convert->flags = nodetype;
a5ca303d
YO
2005 DEBUG_r({
2006 optimize = convert
2007 + NODE_STEP_REGNODE
2008 + regarglen[ OP( convert ) ];
2009 });
2010 /* XXX We really should free up the resource in trie now,
2011 as we won't use them - (which resources?) dmq */
3dab1dad 2012 }
a3621e74 2013 /* needed for dumping*/
e62cc96a 2014 DEBUG_r(if (optimize) {
07be1b83 2015 regnode *opt = convert;
e62cc96a 2016 while ( ++opt < optimize) {
07be1b83
YO
2017 Set_Node_Offset_Length(opt,0,0);
2018 }
786e8c11
YO
2019 /*
2020 Try to clean up some of the debris left after the
2021 optimisation.
a3621e74 2022 */
786e8c11 2023 while( optimize < jumper ) {
07be1b83 2024 mjd_nodelen += Node_Length((optimize));
a3621e74 2025 OP( optimize ) = OPTIMIZED;
07be1b83 2026 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2027 optimize++;
2028 }
07be1b83 2029 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2030 });
2031 } /* end node insert */
55eed653 2032 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
07be1b83 2033#ifndef DEBUGGING
6e8b4190 2034 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 2035#endif
786e8c11
YO
2036 return trie->jump
2037 ? MADE_JUMP_TRIE
2038 : trie->startstate>1
2039 ? MADE_EXACT_TRIE
2040 : MADE_TRIE;
2041}
2042
2043STATIC void
2044S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2045{
2046/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2047
2048 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2049 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2050 ISBN 0-201-10088-6
2051
2052 We find the fail state for each state in the trie, this state is the longest proper
2053 suffix of the current states 'word' that is also a proper prefix of another word in our
2054 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2055 the DFA not to have to restart after its tried and failed a word at a given point, it
2056 simply continues as though it had been matching the other word in the first place.
2057 Consider
2058 'abcdgu'=~/abcdefg|cdgu/
2059 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2060 fail, which would bring use to the state representing 'd' in the second word where we would
2061 try 'g' and succeed, prodceding to match 'cdgu'.
2062 */
2063 /* add a fail transition */
3251b653
NC
2064 const U32 trie_offset = ARG(source);
2065 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2066 U32 *q;
2067 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2068 const U32 numstates = trie->statecount;
786e8c11
YO
2069 const U32 ubound = trie->lasttrans + ucharcount;
2070 U32 q_read = 0;
2071 U32 q_write = 0;
2072 U32 charid;
2073 U32 base = trie->states[ 1 ].trans.base;
2074 U32 *fail;
2075 reg_ac_data *aho;
2076 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2077 GET_RE_DEBUG_FLAGS_DECL;
2078#ifndef DEBUGGING
2079 PERL_UNUSED_ARG(depth);
2080#endif
2081
2082
2083 ARG_SET( stclass, data_slot );
446bd890 2084 aho = PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2085 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2086 aho->trie=trie_offset;
446bd890
NC
2087 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2088 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2089 Newxz( q, numstates, U32);
446bd890 2090 aho->fail = PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2091 aho->refcount = 1;
2092 fail = aho->fail;
2093 /* initialize fail[0..1] to be 1 so that we always have
2094 a valid final fail state */
2095 fail[ 0 ] = fail[ 1 ] = 1;
2096
2097 for ( charid = 0; charid < ucharcount ; charid++ ) {
2098 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2099 if ( newstate ) {
2100 q[ q_write ] = newstate;
2101 /* set to point at the root */
2102 fail[ q[ q_write++ ] ]=1;
2103 }
2104 }
2105 while ( q_read < q_write) {
2106 const U32 cur = q[ q_read++ % numstates ];
2107 base = trie->states[ cur ].trans.base;
2108
2109 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2110 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2111 if (ch_state) {
2112 U32 fail_state = cur;
2113 U32 fail_base;
2114 do {
2115 fail_state = fail[ fail_state ];
2116 fail_base = aho->states[ fail_state ].trans.base;
2117 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2118
2119 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2120 fail[ ch_state ] = fail_state;
2121 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2122 {
2123 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2124 }
2125 q[ q_write++ % numstates] = ch_state;
2126 }
2127 }
2128 }
2129 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2130 when we fail in state 1, this allows us to use the
2131 charclass scan to find a valid start char. This is based on the principle
2132 that theres a good chance the string being searched contains lots of stuff
2133 that cant be a start char.
2134 */
2135 fail[ 0 ] = fail[ 1 ] = 0;
2136 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2137 PerlIO_printf(Perl_debug_log,
2138 "%*sStclass Failtable (%"UVuf" states): 0",
2139 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2140 );
786e8c11
YO
2141 for( q_read=1; q_read<numstates; q_read++ ) {
2142 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2143 }
2144 PerlIO_printf(Perl_debug_log, "\n");
2145 });
2146 Safefree(q);
2147 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2148}
2149
786e8c11 2150
a3621e74 2151/*
5d1c421c
JH
2152 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2153 * These need to be revisited when a newer toolchain becomes available.
2154 */
2155#if defined(__sparc64__) && defined(__GNUC__)
2156# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2157# undef SPARC64_GCC_WORKAROUND
2158# define SPARC64_GCC_WORKAROUND 1
2159# endif
2160#endif
2161
07be1b83 2162#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2163 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2164 SV * const mysv=sv_newmortal(); \
2165 regnode *Next = regnext(scan); \
2166 regprop(RExC_rx, mysv, scan); \
7f69552c 2167 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2168 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2169 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2170 }});
07be1b83 2171
1de06328
YO
2172
2173
2174
2175
07be1b83
YO
2176#define JOIN_EXACT(scan,min,flags) \
2177 if (PL_regkind[OP(scan)] == EXACT) \
2178 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2179
be8e71aa 2180STATIC U32
07be1b83
YO
2181S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2182 /* Merge several consecutive EXACTish nodes into one. */
2183 regnode *n = regnext(scan);
2184 U32 stringok = 1;
2185 regnode *next = scan + NODE_SZ_STR(scan);
2186 U32 merged = 0;
2187 U32 stopnow = 0;
2188#ifdef DEBUGGING
2189 regnode *stop = scan;
72f13be8 2190 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2191#else
d47053eb
RGS
2192 PERL_UNUSED_ARG(depth);
2193#endif
2194#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2195 PERL_UNUSED_ARG(flags);
2196 PERL_UNUSED_ARG(val);
07be1b83 2197#endif
07be1b83
YO
2198 DEBUG_PEEP("join",scan,depth);
2199
2200 /* Skip NOTHING, merge EXACT*. */
2201 while (n &&
2202 ( PL_regkind[OP(n)] == NOTHING ||
2203 (stringok && (OP(n) == OP(scan))))
2204 && NEXT_OFF(n)
2205 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2206
2207 if (OP(n) == TAIL || n > next)
2208 stringok = 0;
2209 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2210 DEBUG_PEEP("skip:",n,depth);
2211 NEXT_OFF(scan) += NEXT_OFF(n);
2212 next = n + NODE_STEP_REGNODE;
2213#ifdef DEBUGGING
2214 if (stringok)
2215 stop = n;
2216#endif
2217 n = regnext(n);
2218 }
2219 else if (stringok) {
786e8c11 2220 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2221 regnode * const nnext = regnext(n);
2222
2223 DEBUG_PEEP("merg",n,depth);
2224
2225 merged++;
2226 if (oldl + STR_LEN(n) > U8_MAX)
2227 break;
2228 NEXT_OFF(scan) += NEXT_OFF(n);
2229 STR_LEN(scan) += STR_LEN(n);
2230 next = n + NODE_SZ_STR(n);
2231 /* Now we can overwrite *n : */
2232 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2233#ifdef DEBUGGING
2234 stop = next - 1;
2235#endif
2236 n = nnext;
2237 if (stopnow) break;
2238 }
2239
d47053eb
RGS
2240#ifdef EXPERIMENTAL_INPLACESCAN
2241 if (flags && !NEXT_OFF(n)) {
2242 DEBUG_PEEP("atch", val, depth);
2243 if (reg_off_by_arg[OP(n)]) {
2244 ARG_SET(n, val - n);
2245 }
2246 else {
2247 NEXT_OFF(n) = val - n;
2248 }
2249 stopnow = 1;
2250 }
07be1b83
YO
2251#endif
2252 }
2253
2254 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2255 /*
2256 Two problematic code points in Unicode casefolding of EXACT nodes:
2257
2258 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2259 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2260
2261 which casefold to
2262
2263 Unicode UTF-8
2264
2265 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2266 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2267
2268 This means that in case-insensitive matching (or "loose matching",
2269 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2270 length of the above casefolded versions) can match a target string
2271 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2272 This would rather mess up the minimum length computation.
2273
2274 What we'll do is to look for the tail four bytes, and then peek
2275 at the preceding two bytes to see whether we need to decrease
2276 the minimum length by four (six minus two).
2277
2278 Thanks to the design of UTF-8, there cannot be false matches:
2279 A sequence of valid UTF-8 bytes cannot be a subsequence of
2280 another valid sequence of UTF-8 bytes.
2281
2282 */
2283 char * const s0 = STRING(scan), *s, *t;
2284 char * const s1 = s0 + STR_LEN(scan) - 1;
2285 char * const s2 = s1 - 4;
e294cc5d
JH
2286#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2287 const char t0[] = "\xaf\x49\xaf\x42";
2288#else
07be1b83 2289 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2290#endif
07be1b83
YO
2291 const char * const t1 = t0 + 3;
2292
2293 for (s = s0 + 2;
2294 s < s2 && (t = ninstr(s, s1, t0, t1));
2295 s = t + 4) {
e294cc5d
JH
2296#ifdef EBCDIC
2297 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2298 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2299#else
07be1b83
YO
2300 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2301 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2302#endif
07be1b83
YO
2303 *min -= 4;
2304 }
2305 }
2306
2307#ifdef DEBUGGING
2308 /* Allow dumping */
2309 n = scan + NODE_SZ_STR(scan);
2310 while (n <= stop) {
2311 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2312 OP(n) = OPTIMIZED;
2313 NEXT_OFF(n) = 0;
2314 }
2315 n++;
2316 }
2317#endif
2318 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2319 return stopnow;
2320}
2321
653099ff
GS
2322/* REx optimizer. Converts nodes into quickier variants "in place".
2323 Finds fixed substrings. */
2324
a0288114 2325/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2326 to the position after last scanned or to NULL. */
2327
40d049e4
YO
2328#define INIT_AND_WITHP \
2329 assert(!and_withp); \
2330 Newx(and_withp,1,struct regnode_charclass_class); \
2331 SAVEFREEPV(and_withp)
07be1b83 2332
b515a41d
YO
2333/* this is a chain of data about sub patterns we are processing that
2334 need to be handled seperately/specially in study_chunk. Its so
2335 we can simulate recursion without losing state. */
2336struct scan_frame;
2337typedef struct scan_frame {
2338 regnode *last; /* last node to process in this frame */
2339 regnode *next; /* next node to process when last is reached */
2340 struct scan_frame *prev; /*previous frame*/
2341 I32 stop; /* what stopparen do we use */
2342} scan_frame;
2343
76e3520e 2344STATIC I32
40d049e4 2345S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2346 I32 *minlenp, I32 *deltap,
40d049e4
YO
2347 regnode *last,
2348 scan_data_t *data,
2349 I32 stopparen,
2350 U8* recursed,
2351 struct regnode_charclass_class *and_withp,
2352 U32 flags, U32 depth)
c277df42
IZ
2353 /* scanp: Start here (read-write). */
2354 /* deltap: Write maxlen-minlen here. */
2355 /* last: Stop before this one. */
40d049e4
YO
2356 /* data: string data about the pattern */
2357 /* stopparen: treat close N as END */
2358 /* recursed: which subroutines have we recursed into */
2359 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2360{
97aff369 2361 dVAR;
c277df42
IZ
2362 I32 min = 0, pars = 0, code;
2363 regnode *scan = *scanp, *next;
2364 I32 delta = 0;
2365 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2366 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2367 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2368 scan_data_t data_fake;
a3621e74 2369 SV *re_trie_maxbuff = NULL;
786e8c11 2370 regnode *first_non_open = scan;
e2e6a0f1 2371 I32 stopmin = I32_MAX;
8aa23a47
YO
2372 scan_frame *frame = NULL;
2373
a3621e74 2374 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2375
13a24bad 2376#ifdef DEBUGGING
40d049e4 2377 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2378#endif
40d049e4 2379
786e8c11 2380 if ( depth == 0 ) {
40d049e4 2381 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2382 first_non_open=regnext(first_non_open);
2383 }
2384
b81d288d 2385
8aa23a47
YO
2386 fake_study_recurse:
2387 while ( scan && OP(scan) != END && scan < last ){
2388 /* Peephole optimizer: */
2389 DEBUG_STUDYDATA(data,depth);
2390 DEBUG_PEEP("Peep",scan,depth);
2391 JOIN_EXACT(scan,&min,0);
2392
2393 /* Follow the next-chain of the current node and optimize
2394 away all the NOTHINGs from it. */
2395 if (OP(scan) != CURLYX) {
2396 const int max = (reg_off_by_arg[OP(scan)]
2397 ? I32_MAX
2398 /* I32 may be smaller than U16 on CRAYs! */
2399 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2400 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2401 int noff;
2402 regnode *n = scan;
2403
2404 /* Skip NOTHING and LONGJMP. */
2405 while ((n = regnext(n))
2406 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2407 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2408 && off + noff < max)
2409 off += noff;
2410 if (reg_off_by_arg[OP(scan)])
2411 ARG(scan) = off;
2412 else
2413 NEXT_OFF(scan) = off;
2414 }
a3621e74 2415
c277df42 2416
8aa23a47
YO
2417
2418 /* The principal pseudo-switch. Cannot be a switch, since we
2419 look into several different things. */
2420 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2421 || OP(scan) == IFTHEN) {
2422 next = regnext(scan);
2423 code = OP(scan);
2424 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2425
2426 if (OP(next) == code || code == IFTHEN) {
2427 /* NOTE - There is similar code to this block below for handling
2428 TRIE nodes on a re-study. If you change stuff here check there
2429 too. */
2430 I32 max1 = 0, min1 = I32_MAX, num = 0;
2431 struct regnode_charclass_class accum;
2432 regnode * const startbranch=scan;
2433
2434 if (flags & SCF_DO_SUBSTR)
2435 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2436 if (flags & SCF_DO_STCLASS)
2437 cl_init_zero(pRExC_state, &accum);
2438
2439 while (OP(scan) == code) {
2440 I32 deltanext, minnext, f = 0, fake;
2441 struct regnode_charclass_class this_class;
2442
2443 num++;
2444 data_fake.flags = 0;
2445 if (data) {
2446 data_fake.whilem_c = data->whilem_c;
2447 data_fake.last_closep = data->last_closep;
2448 }
2449 else
2450 data_fake.last_closep = &fake;
58e23c8d
YO
2451
2452 data_fake.pos_delta = delta;
8aa23a47
YO
2453 next = regnext(scan);
2454 scan = NEXTOPER(scan);
2455 if (code != BRANCH)
c277df42 2456 scan = NEXTOPER(scan);
8aa23a47
YO
2457 if (flags & SCF_DO_STCLASS) {
2458 cl_init(pRExC_state, &this_class);
2459 data_fake.start_class = &this_class;
2460 f = SCF_DO_STCLASS_AND;
58e23c8d 2461 }
8aa23a47
YO
2462 if (flags & SCF_WHILEM_VISITED_POS)
2463 f |= SCF_WHILEM_VISITED_POS;
2464
2465 /* we suppose the run is continuous, last=next...*/
2466 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2467 next, &data_fake,
2468 stopparen, recursed, NULL, f,depth+1);
2469 if (min1 > minnext)
2470 min1 = minnext;
2471 if (max1 < minnext + deltanext)
2472 max1 = minnext + deltanext;
2473 if (deltanext == I32_MAX)
2474 is_inf = is_inf_internal = 1;
2475 scan = next;
2476 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2477 pars++;
2478 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2479 if ( stopmin > minnext)
2480 stopmin = min + min1;
2481 flags &= ~SCF_DO_SUBSTR;
2482 if (data)
2483 data->flags |= SCF_SEEN_ACCEPT;
2484 }
2485 if (data) {
2486 if (data_fake.flags & SF_HAS_EVAL)
2487 data->flags |= SF_HAS_EVAL;
2488 data->whilem_c = data_fake.whilem_c;
3dab1dad 2489 }
8aa23a47
YO
2490 if (flags & SCF_DO_STCLASS)
2491 cl_or(pRExC_state, &accum, &this_class);
2492 }
2493 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2494 min1 = 0;
2495 if (flags & SCF_DO_SUBSTR) {
2496 data->pos_min += min1;
2497 data->pos_delta += max1 - min1;
2498 if (max1 != min1 || is_inf)
2499 data->longest = &(data->longest_float);
2500 }
2501 min += min1;
2502 delta += max1 - min1;
2503 if (flags & SCF_DO_STCLASS_OR) {
2504 cl_or(pRExC_state, data->start_class, &accum);
2505 if (min1) {
2506 cl_and(data->start_class, and_withp);
2507 flags &= ~SCF_DO_STCLASS;
653099ff 2508 }
8aa23a47
YO
2509 }
2510 else if (flags & SCF_DO_STCLASS_AND) {
2511 if (min1) {
2512 cl_and(data->start_class, &accum);
2513 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2514 }
8aa23a47
YO
2515 else {
2516 /* Switch to OR mode: cache the old value of
2517 * data->start_class */
2518 INIT_AND_WITHP;
2519 StructCopy(data->start_class, and_withp,
2520 struct regnode_charclass_class);
2521 flags &= ~SCF_DO_STCLASS_AND;
2522 StructCopy(&accum, data->start_class,
2523 struct regnode_charclass_class);
2524 flags |= SCF_DO_STCLASS_OR;
2525 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2526 }
8aa23a47 2527 }
a3621e74 2528
8aa23a47
YO
2529 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2530 /* demq.
a3621e74 2531
8aa23a47
YO
2532 Assuming this was/is a branch we are dealing with: 'scan' now
2533 points at the item that follows the branch sequence, whatever
2534 it is. We now start at the beginning of the sequence and look
2535 for subsequences of
a3621e74 2536
8aa23a47
YO
2537 BRANCH->EXACT=>x1
2538 BRANCH->EXACT=>x2
2539 tail
a3621e74 2540
8aa23a47 2541 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2542
8aa23a47
YO
2543 If we can find such a subseqence we need to turn the first
2544 element into a trie and then add the subsequent branch exact
2545 strings to the trie.
a3621e74 2546
8aa23a47 2547 We have two cases
a3621e74 2548
8aa23a47 2549 1. patterns where the whole set of branch can be converted.
a3621e74 2550
8aa23a47 2551 2. patterns where only a subset can be converted.
a3621e74 2552
8aa23a47
YO
2553 In case 1 we can replace the whole set with a single regop
2554 for the trie. In case 2 we need to keep the start and end
2555 branchs so
a3621e74 2556
8aa23a47
YO
2557 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2558 becomes BRANCH TRIE; BRANCH X;
786e8c11 2559
8aa23a47
YO
2560 There is an additional case, that being where there is a
2561 common prefix, which gets split out into an EXACT like node
2562 preceding the TRIE node.
a3621e74 2563
8aa23a47
YO
2564 If x(1..n)==tail then we can do a simple trie, if not we make
2565 a "jump" trie, such that when we match the appropriate word
2566 we "jump" to the appopriate tail node. Essentailly we turn
2567 a nested if into a case structure of sorts.
b515a41d 2568
8aa23a47
YO
2569 */
2570
2571 int made=0;
2572 if (!re_trie_maxbuff) {
2573 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2574 if (!SvIOK(re_trie_maxbuff))
2575 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2576 }
2577 if ( SvIV(re_trie_maxbuff)>=0 ) {
2578 regnode *cur;
2579 regnode *first = (regnode *)NULL;
2580 regnode *last = (regnode *)NULL;
2581 regnode *tail = scan;
2582 U8 optype = 0;
2583 U32 count=0;
a3621e74
YO
2584
2585#ifdef DEBUGGING
8aa23a47 2586 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2587#endif
8aa23a47
YO
2588 /* var tail is used because there may be a TAIL
2589 regop in the way. Ie, the exacts will point to the
2590 thing following the TAIL, but the last branch will
2591 point at the TAIL. So we advance tail. If we
2592 have nested (?:) we may have to move through several
2593 tails.
2594 */
2595
2596 while ( OP( tail ) == TAIL ) {
2597 /* this is the TAIL generated by (?:) */
2598 tail = regnext( tail );
2599 }
a3621e74 2600
8aa23a47
YO
2601
2602 DEBUG_OPTIMISE_r({
2603 regprop(RExC_rx, mysv, tail );
2604 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2605 (int)depth * 2 + 2, "",
2606 "Looking for TRIE'able sequences. Tail node is: ",
2607 SvPV_nolen_const( mysv )
2608 );
2609 });
2610
2611 /*
2612
2613 step through the branches, cur represents each
2614 branch, noper is the first thing to be matched
2615 as part of that branch and noper_next is the
2616 regnext() of that node. if noper is an EXACT
2617 and noper_next is the same as scan (our current
2618 position in the regex) then the EXACT branch is
2619 a possible optimization target. Once we have
2620 two or more consequetive such branches we can
2621 create a trie of the EXACT's contents and stich
2622 it in place. If the sequence represents all of
2623 the branches we eliminate the whole thing and
2624 replace it with a single TRIE. If it is a
2625 subsequence then we need to stitch it in. This
2626 means the first branch has to remain, and needs
2627 to be repointed at the item on the branch chain
2628 following the last branch optimized. This could
2629 be either a BRANCH, in which case the
2630 subsequence is internal, or it could be the
2631 item following the branch sequence in which
2632 case the subsequence is at the end.
2633
2634 */
2635
2636 /* dont use tail as the end marker for this traverse */
2637 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2638 regnode * const noper = NEXTOPER( cur );
b515a41d 2639#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2640 regnode * const noper_next = regnext( noper );
b515a41d
YO
2641#endif
2642
8aa23a47
YO
2643 DEBUG_OPTIMISE_r({
2644 regprop(RExC_rx, mysv, cur);
2645 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2646 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2647
2648 regprop(RExC_rx, mysv, noper);
2649 PerlIO_printf( Perl_debug_log, " -> %s",
2650 SvPV_nolen_const(mysv));
2651
2652 if ( noper_next ) {
2653 regprop(RExC_rx, mysv, noper_next );
2654 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2655 SvPV_nolen_const(mysv));
2656 }
2657 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2658 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2659 });
2660 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2661 : PL_regkind[ OP( noper ) ] == EXACT )
2662 || OP(noper) == NOTHING )
786e8c11 2663#ifdef NOJUMPTRIE
8aa23a47 2664 && noper_next == tail
786e8c11 2665#endif
8aa23a47
YO
2666 && count < U16_MAX)
2667 {
2668 count++;
2669 if ( !first || optype == NOTHING ) {
2670 if (!first) first = cur;
2671 optype = OP( noper );
2672 } else {
2673 last = cur;
2674 }
2675 } else {
2676 if ( last ) {
2677 make_trie( pRExC_state,
2678 startbranch, first, cur, tail, count,
2679 optype, depth+1 );
2680 }
2681 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2682#ifdef NOJUMPTRIE
8aa23a47 2683 && noper_next == tail
786e8c11 2684#endif
8aa23a47
YO
2685 ){
2686 count = 1;
2687 first = cur;
2688 optype = OP( noper );
2689 } else {
2690 count = 0;
2691 first = NULL;
2692 optype = 0;
2693 }
2694 last = NULL;
2695 }
2696 }
2697 DEBUG_OPTIMISE_r({
2698 regprop(RExC_rx, mysv, cur);
2699 PerlIO_printf( Perl_debug_log,
2700 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2701 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2702
2703 });
2704 if ( last ) {
2705 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2706#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2707 if ( ((made == MADE_EXACT_TRIE &&
2708 startbranch == first)
2709 || ( first_non_open == first )) &&
2710 depth==0 ) {
2711 flags |= SCF_TRIE_RESTUDY;
2712 if ( startbranch == first
2713 && scan == tail )
2714 {
2715 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2716 }
2717 }
3dab1dad 2718#endif
8aa23a47
YO
2719 }
2720 }
2721
2722 } /* do trie */
2723
653099ff 2724 }
8aa23a47
YO
2725 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2726 scan = NEXTOPER(NEXTOPER(scan));
2727 } else /* single branch is optimized. */
2728 scan = NEXTOPER(scan);
2729 continue;
2730 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2731 scan_frame *newframe = NULL;
2732 I32 paren;
2733 regnode *start;
2734 regnode *end;
2735
2736 if (OP(scan) != SUSPEND) {
2737 /* set the pointer */
2738 if (OP(scan) == GOSUB) {
2739 paren = ARG(scan);
2740 RExC_recurse[ARG2L(scan)] = scan;
2741 start = RExC_open_parens[paren-1];
2742 end = RExC_close_parens[paren-1];
2743 } else {
2744 paren = 0;
f8fc2ecf 2745 start = RExC_rxi->program + 1;
8aa23a47
YO
2746 end = RExC_opend;
2747 }
2748 if (!recursed) {
2749 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2750 SAVEFREEPV(recursed);
2751 }
2752 if (!PAREN_TEST(recursed,paren+1)) {
2753 PAREN_SET(recursed,paren+1);
2754 Newx(newframe,1,scan_frame);
2755 } else {
2756 if (flags & SCF_DO_SUBSTR) {
2757 scan_commit(pRExC_state,data,minlenp);
2758 data->longest = &(data->longest_float);
2759 }
2760 is_inf = is_inf_internal = 1;
2761 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2762 cl_anything(pRExC_state, data->start_class);
2763 flags &= ~SCF_DO_STCLASS;
2764 }
2765 } else {
2766 Newx(newframe,1,scan_frame);
2767 paren = stopparen;
2768 start = scan+2;
2769 end = regnext(scan);
2770 }
2771 if (newframe) {
2772 assert(start);
2773 assert(end);
2774 SAVEFREEPV(newframe);
2775 newframe->next = regnext(scan);
2776 newframe->last = last;
2777 newframe->stop = stopparen;
2778 newframe->prev = frame;
2779
2780 frame = newframe;
2781 scan = start;
2782 stopparen = paren;
2783 last = end;
2784
2785 continue;
2786 }
2787 }
2788 else if (OP(scan) == EXACT) {
2789 I32 l = STR_LEN(scan);
2790 UV uc;
2791 if (UTF) {
2792 const U8 * const s = (U8*)STRING(scan);
2793 l = utf8_length(s, s + l);
2794 uc = utf8_to_uvchr(s, NULL);
2795 } else {
2796 uc = *((U8*)STRING(scan));
2797 }
2798 min += l;
2799 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2800 /* The code below prefers earlier match for fixed
2801 offset, later match for variable offset. */
2802 if (data->last_end == -1) { /* Update the start info. */
2803 data->last_start_min = data->pos_min;
2804 data->last_start_max = is_inf
2805 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2806 }
8aa23a47
YO
2807 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2808 if (UTF)
2809 SvUTF8_on(data->last_found);
2810 {
2811 SV * const sv = data->last_found;
2812 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2813 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2814 if (mg && mg->mg_len >= 0)
2815 mg->mg_len += utf8_length((U8*)STRING(scan),
2816 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2817 }
8aa23a47
YO
2818 data->last_end = data->pos_min + l;
2819 data->pos_min += l; /* As in the first entry. */
2820 data->flags &= ~SF_BEFORE_EOL;
2821 }
2822 if (flags & SCF_DO_STCLASS_AND) {
2823 /* Check whether it is compatible with what we know already! */
2824 int compat = 1;
2825
2826 if (uc >= 0x100 ||
2827 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2828 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2829 && (!(data->start_class->flags & ANYOF_FOLD)
2830 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2831 )
2832 compat = 0;
2833 ANYOF_CLASS_ZERO(data->start_class);
2834 ANYOF_BITMAP_ZERO(data->start_class);
2835 if (compat)
2836 ANYOF_BITMAP_SET(data->start_class, uc);
2837 data->start_class->flags &= ~ANYOF_EOS;
2838 if (uc < 0x100)
2839 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2840 }
2841 else if (flags & SCF_DO_STCLASS_OR) {
2842 /* false positive possible if the class is case-folded */
2843 if (uc < 0x100)
2844 ANYOF_BITMAP_SET(data->start_class, uc);
2845 else
2846 data->start_class->flags |= ANYOF_UNICODE_ALL;
2847 data->start_class->flags &= ~ANYOF_EOS;
2848 cl_and(data->start_class, and_withp);
2849 }
2850 flags &= ~SCF_DO_STCLASS;
2851 }
2852 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2853 I32 l = STR_LEN(scan);
2854 UV uc = *((U8*)STRING(scan));
2855
2856 /* Search for fixed substrings supports EXACT only. */
2857 if (flags & SCF_DO_SUBSTR) {
2858 assert(data);
2859 scan_commit(pRExC_state, data, minlenp);
2860 }
2861 if (UTF) {
2862 const U8 * const s = (U8 *)STRING(scan);
2863 l = utf8_length(s, s + l);
2864 uc = utf8_to_uvchr(s, NULL);
2865 }
2866 min += l;
2867 if (flags & SCF_DO_SUBSTR)
2868 data->pos_min += l;
2869 if (flags & SCF_DO_STCLASS_AND) {
2870 /* Check whether it is compatible with what we know already! */
2871 int compat = 1;
2872
2873 if (uc >= 0x100 ||
2874 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2875 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2876 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2877 compat = 0;
2878 ANYOF_CLASS_ZERO(data->start_class);
2879 ANYOF_BITMAP_ZERO(data->start_class);
2880 if (compat) {
2881 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2882 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2883 data->start_class->flags |= ANYOF_FOLD;
2884 if (OP(scan) == EXACTFL)
2885 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2886 }
8aa23a47
YO
2887 }
2888 else if (flags & SCF_DO_STCLASS_OR) {
2889 if (data->start_class->flags & ANYOF_FOLD) {
2890 /* false positive possible if the class is case-folded.
2891 Assume that the locale settings are the same... */
1aa99e6b
IH
2892 if (uc < 0x100)
2893 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2894 data->start_class->flags &= ~ANYOF_EOS;
2895 }
8aa23a47 2896 cl_and(data->start_class, and_withp);
653099ff 2897 }
8aa23a47
YO
2898 flags &= ~SCF_DO_STCLASS;
2899 }
2900 else if (strchr((const char*)PL_varies,OP(scan))) {
2901 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2902 I32 f = flags, pos_before = 0;
2903 regnode * const oscan = scan;
2904 struct regnode_charclass_class this_class;
2905 struct regnode_charclass_class *oclass = NULL;
2906 I32 next_is_eval = 0;
2907
2908 switch (PL_regkind[OP(scan)]) {
2909 case WHILEM: /* End of (?:...)* . */
2910 scan = NEXTOPER(scan);
2911 goto finish;
2912 case PLUS:
2913 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2914 next = NEXTOPER(scan);
2915 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2916 mincount = 1;
2917 maxcount = REG_INFTY;
2918 next = regnext(scan);
2919 scan = NEXTOPER(scan);
2920 goto do_curly;
2921 }
2922 }
2923 if (flags & SCF_DO_SUBSTR)
2924 data->pos_min++;
2925 min++;
2926 /* Fall through. */
2927 case STAR:
2928 if (flags & SCF_DO_STCLASS) {
2929 mincount = 0;
2930 maxcount = REG_INFTY;
2931 next = regnext(scan);
2932 scan = NEXTOPER(scan);
2933 goto do_curly;
2934 }
2935 is_inf = is_inf_internal = 1;
2936 scan = regnext(scan);
c277df42 2937 if (flags & SCF_DO_SUBSTR) {
8aa23a47
YO
2938 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2939 data->longest = &(data->longest_float);
c277df42 2940 }
8aa23a47
YO
2941 goto optimize_curly_tail;
2942 case CURLY:
2943 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2944 && (scan->flags == stopparen))
2945 {
2946 mincount = 1;
2947 maxcount = 1;
2948 } else {
2949 mincount = ARG1(scan);
2950 maxcount = ARG2(scan);
653099ff 2951 }
8aa23a47
YO
2952 next = regnext(scan);
2953 if (OP(scan) == CURLYX) {
2954 I32 lp = (data ? *(data->last_closep) : 0);
2955 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2956 }
8aa23a47
YO
2957 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2958 next_is_eval = (OP(scan) == EVAL);
2959 do_curly:
2960 if (flags & SCF_DO_SUBSTR) {
2961 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2962 pos_before = data->pos_min;
b45f050a 2963 }
8aa23a47
YO
2964 if (data) {
2965 fl = data->flags;
2966 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2967 if (is_inf)
2968 data->flags |= SF_IS_INF;
2969 }
2970 if (flags & SCF_DO_STCLASS) {
2971 cl_init(pRExC_state, &this_class);
2972 oclass = data->start_class;
2973 data->start_class = &this_class;
2974 f |= SCF_DO_STCLASS_AND;
2975 f &= ~SCF_DO_STCLASS_OR;
2976 }
2977 /* These are the cases when once a subexpression
2978 fails at a particular position, it cannot succeed
2979 even after backtracking at the enclosing scope.
2980
2981 XXXX what if minimal match and we are at the
2982 initial run of {n,m}? */
2983 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2984 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 2985
8aa23a47
YO
2986 /* This will finish on WHILEM, setting scan, or on NULL: */
2987 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2988 last, data, stopparen, recursed, NULL,
2989 (mincount == 0
2990 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 2991
8aa23a47
YO
2992 if (flags & SCF_DO_STCLASS)
2993 data->start_class = oclass;
2994 if (mincount == 0 || minnext == 0) {
2995 if (flags & SCF_DO_STCLASS_OR) {
2996 cl_or(pRExC_state, data->start_class, &this_class);
2997 }
2998 else if (flags & SCF_DO_STCLASS_AND) {
2999 /* Switch to OR mode: cache the old value of
3000 * data->start_class */
3001 INIT_AND_WITHP;
3002 StructCopy(data->start_class, and_withp,
3003 struct regnode_charclass_class);
3004 flags &= ~SCF_DO_STCLASS_AND;
3005 StructCopy(&this_class, data->start_class,
3006 struct regnode_charclass_class);
3007 flags |= SCF_DO_STCLASS_OR;
3008 data->start_class->flags |= ANYOF_EOS;
3009 }
3010 } else { /* Non-zero len */
3011 if (flags & SCF_DO_STCLASS_OR) {
3012 cl_or(pRExC_state, data->start_class, &this_class);
3013 cl_and(data->start_class, and_withp);
3014 }
3015 else if (flags & SCF_DO_STCLASS_AND)
3016 cl_and(data->start_class, &this_class);
3017 flags &= ~SCF_DO_STCLASS;
3018 }
3019 if (!scan) /* It was not CURLYX, but CURLY. */
3020 scan = next;
3021 if ( /* ? quantifier ok, except for (?{ ... }) */
3022 (next_is_eval || !(mincount == 0 && maxcount == 1))
3023 && (minnext == 0) && (deltanext == 0)
3024 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3025 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3026 && ckWARN(WARN_REGEXP))
3027 {
3028 vWARN(RExC_parse,
3029 "Quantifier unexpected on zero-length expression");
3030 }
3031
3032 min += minnext * mincount;
3033 is_inf_internal |= ((maxcount == REG_INFTY
3034 && (minnext + deltanext) > 0)
3035 || deltanext == I32_MAX);
3036 is_inf |= is_inf_internal;
3037 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3038
3039 /* Try powerful optimization CURLYX => CURLYN. */
3040 if ( OP(oscan) == CURLYX && data
3041 && data->flags & SF_IN_PAR
3042 && !(data->flags & SF_HAS_EVAL)
3043 && !deltanext && minnext == 1 ) {
3044 /* Try to optimize to CURLYN. */
3045 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3046 regnode * const nxt1 = nxt;
497b47a8 3047#ifdef DEBUGGING
8aa23a47 3048 regnode *nxt2;
497b47a8 3049#endif
c277df42 3050
8aa23a47
YO
3051 /* Skip open. */
3052 nxt = regnext(nxt);
3053 if (!strchr((const char*)PL_simple,OP(nxt))
3054 && !(PL_regkind[OP(nxt)] == EXACT
3055 && STR_LEN(nxt) == 1))
3056 goto nogo;
497b47a8 3057#ifdef DEBUGGING
8aa23a47 3058 nxt2 = nxt;
497b47a8 3059#endif
8aa23a47
YO
3060 nxt = regnext(nxt);
3061 if (OP(nxt) != CLOSE)
3062 goto nogo;
3063 if (RExC_open_parens) {
3064 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3065 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3066 }
3067 /* Now we know that nxt2 is the only contents: */
3068 oscan->flags = (U8)ARG(nxt);
3069 OP(oscan) = CURLYN;
3070 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3071
c277df42 3072#ifdef DEBUGGING
8aa23a47
YO
3073 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3074 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3075 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3076 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3077 OP(nxt + 1) = OPTIMIZED; /* was count. */
3078 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3079#endif
8aa23a47
YO
3080 }
3081 nogo:
3082
3083 /* Try optimization CURLYX => CURLYM. */
3084 if ( OP(oscan) == CURLYX && data
3085 && !(data->flags & SF_HAS_PAR)
3086 && !(data->flags & SF_HAS_EVAL)
3087 && !deltanext /* atom is fixed width */
3088 && minnext != 0 /* CURLYM can't handle zero width */
3089 ) {
3090 /* XXXX How to optimize if data == 0? */
3091 /* Optimize to a simpler form. */
3092 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3093 regnode *nxt2;
3094
3095 OP(oscan) = CURLYM;
3096 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3097 && (OP(nxt2) != WHILEM))
3098 nxt = nxt2;
3099 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3100 /* Need to optimize away parenths. */
3101 if (data->flags & SF_IN_PAR) {
3102 /* Set the parenth number. */
3103 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3104
3105 if (OP(nxt) != CLOSE)
3106 FAIL("Panic opt close");
3107 oscan->flags = (U8)ARG(nxt);
3108 if (RExC_open_parens) {
3109 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3110 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3111 }
8aa23a47
YO
3112 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3113 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3114
c277df42 3115#ifdef DEBUGGING
8aa23a47
YO
3116 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3117 OP(nxt + 1) = OPTIMIZED; /* was count. */
3118 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3119 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3120#endif
c277df42 3121#if 0
8aa23a47
YO
3122 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3123 regnode *nnxt = regnext(nxt1);
3124
3125 if (nnxt == nxt) {
3126 if (reg_off_by_arg[OP(nxt1)])
3127 ARG_SET(nxt1, nxt2 - nxt1);
3128 else if (nxt2 - nxt1 < U16_MAX)
3129 NEXT_OFF(nxt1) = nxt2 - nxt1;
3130 else
3131 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3132 }
8aa23a47 3133 nxt1 = nnxt;
c277df42 3134 }
5d1c421c 3135#endif
8aa23a47
YO
3136 /* Optimize again: */
3137 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3138 NULL, stopparen, recursed, NULL, 0,depth+1);
3139 }
3140 else
3141 oscan->flags = 0;
3142 }
3143 else if ((OP(oscan) == CURLYX)
3144 && (flags & SCF_WHILEM_VISITED_POS)
3145 /* See the comment on a similar expression above.
3146 However, this time it not a subexpression
3147 we care about, but the expression itself. */
3148 && (maxcount == REG_INFTY)
3149 && data && ++data->whilem_c < 16) {
3150 /* This stays as CURLYX, we can put the count/of pair. */
3151 /* Find WHILEM (as in regexec.c) */
3152 regnode *nxt = oscan + NEXT_OFF(oscan);
3153
3154 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3155 nxt += ARG(nxt);
3156 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3157 | (RExC_whilem_seen << 4)); /* On WHILEM */
3158 }
3159 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3160 pars++;
3161 if (flags & SCF_DO_SUBSTR) {
3162 SV *last_str = NULL;
3163 int counted = mincount != 0;
a0ed51b3 3164
8aa23a47
YO
3165 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3166#if defined(SPARC64_GCC_WORKAROUND)
3167 I32 b = 0;
3168 STRLEN l = 0;
3169 const char *s = NULL;
3170 I32 old = 0;
b515a41d 3171
8aa23a47
YO
3172 if (pos_before >= data->last_start_min)
3173 b = pos_before;
3174 else
3175 b = data->last_start_min;
b515a41d 3176
8aa23a47
YO
3177 l = 0;
3178 s = SvPV_const(data->last_found, l);
3179 old = b - data->last_start_min;
3180
3181#else
3182 I32 b = pos_before >= data->last_start_min
3183 ? pos_before : data->last_start_min;
3184 STRLEN l;
3185 const char * const s = SvPV_const(data->last_found, l);
3186 I32 old = b - data->last_start_min;
3187#endif
3188
3189 if (UTF)
3190 old = utf8_hop((U8*)s, old) - (U8*)s;
3191
3192 l -= old;
3193 /* Get the added string: */
3194 last_str = newSVpvn(s + old, l);
3195 if (UTF)
3196 SvUTF8_on(last_str);
3197 if (deltanext == 0 && pos_before == b) {
3198 /* What was added is a constant string */
3199 if (mincount > 1) {
3200 SvGROW(last_str, (mincount * l) + 1);
3201 repeatcpy(SvPVX(last_str) + l,
3202 SvPVX_const(last_str), l, mincount - 1);
3203 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3204 /* Add additional parts. */
3205 SvCUR_set(data->last_found,
3206 SvCUR(data->last_found) - l);
3207 sv_catsv(data->last_found, last_str);
3208 {
3209 SV * sv = data->last_found;
3210 MAGIC *mg =
3211 SvUTF8(sv) && SvMAGICAL(sv) ?
3212 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3213 if (mg && mg->mg_len >= 0)
3214 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3215 }
8aa23a47 3216 data->last_end += l * (mincount - 1);
b515a41d 3217 }
8aa23a47
YO
3218 } else {
3219 /* start offset must point into the last copy */
3220 data->last_start_min += minnext * (mincount - 1);
3221 data->last_start_max += is_inf ? I32_MAX
3222 : (maxcount - 1) * (minnext + data->pos_delta);
3223 }
c277df42 3224 }
8aa23a47
YO
3225 /* It is counted once already... */
3226 data->pos_min += minnext * (mincount - counted);
3227 data->pos_delta += - counted * deltanext +
3228 (minnext + deltanext) * maxcount - minnext * mincount;
3229 if (mincount != maxcount) {
3230 /* Cannot extend fixed substrings found inside
3231 the group. */
3232 scan_commit(pRExC_state,data,minlenp);
3233 if (mincount && last_str) {
3234 SV * const sv = data->last_found;
3235 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3236 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3237
3238 if (mg)
3239 mg->mg_len = -1;
3240 sv_setsv(sv, last_str);
3241 data->last_end = data->pos_min;
3242 data->last_start_min =
3243 data->pos_min - CHR_SVLEN(last_str);
3244 data->last_start_max = is_inf
3245 ? I32_MAX
3246 : data->pos_min + data->pos_delta
3247 - CHR_SVLEN(last_str);
3248 }
3249 data->longest = &(data->longest_float);
3250 }
3251 SvREFCNT_dec(last_str);
c277df42 3252 }
8aa23a47
YO
3253 if (data && (fl & SF_HAS_EVAL))
3254 data->flags |= SF_HAS_EVAL;
3255 optimize_curly_tail:
3256 if (OP(oscan) != CURLYX) {
3257 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3258 && NEXT_OFF(next))
3259 NEXT_OFF(oscan) += NEXT_OFF(next);
3260 }
3261 continue;
3262 default: /* REF and CLUMP only? */
3263 if (flags & SCF_DO_SUBSTR) {
3264 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3265 data->longest = &(data->longest_float);
3266 }
3267 is_inf = is_inf_internal = 1;
3268 if (flags & SCF_DO_STCLASS_OR)
3269 cl_anything(pRExC_state, data->start_class);
3270 flags &= ~SCF_DO_STCLASS;
3271 break;
c277df42 3272 }
8aa23a47
YO
3273 }
3274 else if (strchr((const char*)PL_simple,OP(scan))) {
3275 int value = 0;
653099ff 3276
8aa23a47
YO
3277 if (flags & SCF_DO_SUBSTR) {
3278 scan_commit(pRExC_state,data,minlenp);
3279 data->pos_min++;
3280 }
3281 min++;
3282 if (flags & SCF_DO_STCLASS) {
3283 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3284
8aa23a47
YO
3285 /* Some of the logic below assumes that switching
3286 locale on will only add false positives. */
3287 switch (PL_regkind[OP(scan)]) {
3288 case SANY:
3289 default:
3290 do_default:
3291 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3292 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3293 cl_anything(pRExC_state, data->start_class);
3294 break;
3295 case REG_ANY:
3296 if (OP(scan) == SANY)
3297 goto do_default;
3298 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3299 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3300 || (data->start_class->flags & ANYOF_CLASS));
3301 cl_anything(pRExC_state, data->start_class);
653099ff 3302 }
8aa23a47
YO
3303 if (flags & SCF_DO_STCLASS_AND || !value)
3304 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3305 break;
3306 case ANYOF:
3307 if (flags & SCF_DO_STCLASS_AND)
3308 cl_and(data->start_class,
3309 (struct regnode_charclass_class*)scan);
653099ff 3310 else
8aa23a47
YO
3311 cl_or(pRExC_state, data->start_class,
3312 (struct regnode_charclass_class*)scan);
3313 break;
3314 case ALNUM:
3315 if (flags & SCF_DO_STCLASS_AND) {
3316 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3317 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3318 for (value = 0; value < 256; value++)
3319 if (!isALNUM(value))
3320 ANYOF_BITMAP_CLEAR(data->start_class, value);
3321 }
653099ff 3322 }
8aa23a47
YO
3323 else {
3324 if (data->start_class->flags & ANYOF_LOCALE)
3325 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3326 else {
3327 for (value = 0; value < 256; value++)
3328 if (isALNUM(value))
3329 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3330 }
8aa23a47
YO
3331 }
3332 break;
3333 case ALNUML:
3334 if (flags & SCF_DO_STCLASS_AND) {
3335 if (data->start_class->flags & ANYOF_LOCALE)
3336 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3337 }
3338 else {
3339 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3340 data->start_class->flags |= ANYOF_LOCALE;
3341 }
3342 break;
3343 case NALNUM:
3344 if (flags & SCF_DO_STCLASS_AND) {
3345 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3346 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3347 for (value = 0; value < 256; value++)
3348 if (isALNUM(value))
3349 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3350 }
3351 }
8aa23a47
YO
3352 else {
3353 if (data->start_class->flags & ANYOF_LOCALE)
3354 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3355 else {
3356 for (value = 0; value < 256; value++)
3357 if (!isALNUM(value))
3358 ANYOF_BITMAP_SET(data->start_class, value);
3359 }
653099ff 3360 }
8aa23a47
YO
3361 break;
3362 case NALNUML:
3363 if (flags & SCF_DO_STCLASS_AND) {
3364 if (data->start_class->flags & ANYOF_LOCALE)
3365 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3366 }
8aa23a47
YO
3367 else {
3368 data->start_class->flags |= ANYOF_LOCALE;
3369 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3370 }
3371 break;
3372 case SPACE:
3373 if (flags & SCF_DO_STCLASS_AND) {
3374 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3375 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3376 for (value = 0; value < 256; value++)
3377 if (!isSPACE(value))
3378 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3379 }
3380 }
8aa23a47
YO
3381 else {
3382 if (data->start_class->flags & ANYOF_LOCALE)
3383 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3384 else {
3385 for (value = 0; value < 256; value++)
3386 if (isSPACE(value))
3387 ANYOF_BITMAP_SET(data->start_class, value);
3388 }
653099ff 3389 }
8aa23a47
YO
3390 break;
3391 case SPACEL:
3392 if (flags & SCF_DO_STCLASS_AND) {
3393 if (data->start_class->flags & ANYOF_LOCALE)
3394 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3395 }
3396 else {
3397 data->start_class->flags |= ANYOF_LOCALE;
3398 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3399 }
3400 break;
3401 case NSPACE:
3402 if (flags & SCF_DO_STCLASS_AND) {
3403 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3404 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3405 for (value = 0; value < 256; value++)
3406 if (isSPACE(value))
3407 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3408 }
8aa23a47
YO
3409 }
3410 else {
3411 if (data->start_class->flags & ANYOF_LOCALE)
3412 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3413 else {
3414 for (value = 0; value < 256; value++)
3415 if (!isSPACE(value))
3416 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3417 }
3418 }
8aa23a47
YO
3419 break;
3420 case NSPACEL:
3421 if (flags & SCF_DO_STCLASS_AND) {
3422 if (data->start_class->flags & ANYOF_LOCALE) {
3423 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3424 for (value = 0; value < 256; value++)
3425 if (!isSPACE(value))
3426 ANYOF_BITMAP_CLEAR(data->start_class, value);
3427 }
653099ff 3428 }
8aa23a47
YO
3429 else {
3430 data->start_class->flags |= ANYOF_LOCALE;
3431 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3432 }
3433 break;
3434 case DIGIT:
3435 if (flags & SCF_DO_STCLASS_AND) {
3436 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3437 for (value = 0; value < 256; value++)
3438 if (!isDIGIT(value))
3439 ANYOF_BITMAP_CLEAR(data->start_class, value);
3440 }
3441 else {
3442 if (data->start_class->flags & ANYOF_LOCALE)
3443 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3444 else {
3445 for (value = 0; value < 256; value++)
3446 if (isDIGIT(value))
3447 ANYOF_BITMAP_SET(data->start_class, value);
3448 }
3449 }
3450 break;
3451 case NDIGIT:
3452 if (flags & SCF_DO_STCLASS_AND) {
3453 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3454 for (value = 0; value < 256; value++)
3455 if (isDIGIT(value))
3456 ANYOF_BITMAP_CLEAR(data->start_class, value);
3457 }
3458 else {
3459 if (data->start_class->flags & ANYOF_LOCALE)
3460 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3461 else {
3462 for (value = 0; value < 256; value++)
3463 if (!isDIGIT(value))
3464 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3465 }
3466 }
8aa23a47
YO
3467 break;
3468 }
3469 if (flags & SCF_DO_STCLASS_OR)
3470 cl_and(data->start_class, and_withp);
3471 flags &= ~SCF_DO_STCLASS;
3472 }
3473 }
3474 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3475 data->flags |= (OP(scan) == MEOL
3476 ? SF_BEFORE_MEOL
3477 : SF_BEFORE_SEOL);
3478 }
3479 else if ( PL_regkind[OP(scan)] == BRANCHJ
3480 /* Lookbehind, or need to calculate parens/evals/stclass: */
3481 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3482 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3483 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3484 || OP(scan) == UNLESSM )
3485 {
3486 /* Negative Lookahead/lookbehind
3487 In this case we can't do fixed string optimisation.
3488 */
1de06328 3489
8aa23a47
YO
3490 I32 deltanext, minnext, fake = 0;
3491 regnode *nscan;
3492 struct regnode_charclass_class intrnl;
3493 int f = 0;
1de06328 3494
8aa23a47
YO
3495 data_fake.flags = 0;
3496 if (data) {
3497 data_fake.whilem_c = data->whilem_c;
3498 data_fake.last_closep = data->last_closep;
c277df42 3499 }
8aa23a47
YO
3500 else
3501 data_fake.last_closep = &fake;
58e23c8d 3502 data_fake.pos_delta = delta;
8aa23a47
YO
3503 if ( flags & SCF_DO_STCLASS && !scan->flags
3504 && OP(scan) == IFMATCH ) { /* Lookahead */
3505 cl_init(pRExC_state, &intrnl);
3506 data_fake.start_class = &intrnl;
3507 f |= SCF_DO_STCLASS_AND;
3508 }
3509 if (flags & SCF_WHILEM_VISITED_POS)
3510 f |= SCF_WHILEM_VISITED_POS;
3511 next = regnext(scan);
3512 nscan = NEXTOPER(NEXTOPER(scan));
3513 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3514 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3515 if (scan->flags) {
3516 if (deltanext) {
58e23c8d 3517 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3518 }
3519 else if (minnext > (I32)U8_MAX) {
58e23c8d 3520 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3521 }
3522 scan->flags = (U8)minnext;
3523 }
3524 if (data) {
3525 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3526 pars++;
3527 if (data_fake.flags & SF_HAS_EVAL)
3528 data->flags |= SF_HAS_EVAL;
3529 data->whilem_c = data_fake.whilem_c;
3530 }
3531 if (f & SCF_DO_STCLASS_AND) {
3532 const int was = (data->start_class->flags & ANYOF_EOS);
3533
3534 cl_and(data->start_class, &intrnl);
3535 if (was)
3536 data->start_class->flags |= ANYOF_EOS;
3537 }
cb434fcc 3538 }
8aa23a47
YO
3539#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3540 else {
3541 /* Positive Lookahead/lookbehind
3542 In this case we can do fixed string optimisation,
3543 but we must be careful about it. Note in the case of
3544 lookbehind the positions will be offset by the minimum
3545 length of the pattern, something we won't know about
3546 until after the recurse.
3547 */
3548 I32 deltanext, fake = 0;
3549 regnode *nscan;
3550 struct regnode_charclass_class intrnl;
3551 int f = 0;
3552 /* We use SAVEFREEPV so that when the full compile
3553 is finished perl will clean up the allocated
3554 minlens when its all done. This was we don't
3555 have to worry about freeing them when we know
3556 they wont be used, which would be a pain.
3557 */
3558 I32 *minnextp;
3559 Newx( minnextp, 1, I32 );
3560 SAVEFREEPV(minnextp);
3561
3562 if (data) {
3563 StructCopy(data, &data_fake, scan_data_t);
3564 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3565 f |= SCF_DO_SUBSTR;
3566 if (scan->flags)
3567 scan_commit(pRExC_state, &data_fake,minlenp);
3568 data_fake.last_found=newSVsv(data->last_found);
3569 }
3570 }
3571 else
3572 data_fake.last_closep = &fake;
3573 data_fake.flags = 0;
58e23c8d 3574 data_fake.pos_delta = delta;
8aa23a47
YO
3575 if (is_inf)
3576 data_fake.flags |= SF_IS_INF;
3577 if ( flags & SCF_DO_STCLASS && !scan->flags
3578 && OP(scan) == IFMATCH ) { /* Lookahead */
3579 cl_init(pRExC_state, &intrnl);
3580 data_fake.start_class = &intrnl;
3581 f |= SCF_DO_STCLASS_AND;
3582 }
3583 if (flags & SCF_WHILEM_VISITED_POS)
3584 f |= SCF_WHILEM_VISITED_POS;
3585 next = regnext(scan);
3586 nscan = NEXTOPER(NEXTOPER(scan));
3587
3588 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3589 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3590 if (scan->flags) {
3591 if (deltanext) {
58e23c8d 3592 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3593 }
3594 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3595 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3596 }
3597 scan->flags = (U8)*minnextp;
3598 }
3599
3600 *minnextp += min;
3601
3602 if (f & SCF_DO_STCLASS_AND) {
3603 const int was = (data->start_class->flags & ANYOF_EOS);
3604
3605 cl_and(data->start_class, &intrnl);
3606 if (was)
3607 data->start_class->flags |= ANYOF_EOS;
3608 }
3609 if (data) {
3610 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3611 pars++;
3612 if (data_fake.flags & SF_HAS_EVAL)
3613 data->flags |= SF_HAS_EVAL;
3614 data->whilem_c = data_fake.whilem_c;
3615 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3616 if (RExC_rx->minlen<*minnextp)
3617 RExC_rx->minlen=*minnextp;
3618 scan_commit(pRExC_state, &data_fake, minnextp);
3619 SvREFCNT_dec(data_fake.last_found);
3620
3621 if ( data_fake.minlen_fixed != minlenp )
3622 {
3623 data->offset_fixed= data_fake.offset_fixed;
3624 data->minlen_fixed= data_fake.minlen_fixed;
3625 data->lookbehind_fixed+= scan->flags;
3626 }
3627 if ( data_fake.minlen_float != minlenp )
3628 {
3629 data->minlen_float= data_fake.minlen_float;
3630 data->offset_float_min=data_fake.offset_float_min;
3631 data->offset_float_max=data_fake.offset_float_max;
3632 data->lookbehind_float+= scan->flags;
3633 }
3634 }
3635 }
3636
3637
40d049e4 3638 }
8aa23a47
YO
3639#endif
3640 }
3641 else if (OP(scan) == OPEN) {
3642 if (stopparen != (I32)ARG(scan))
3643 pars++;
3644 }
3645 else if (OP(scan) == CLOSE) {
3646 if (stopparen == (I32)ARG(scan)) {
3647 break;
3648 }
3649 if ((I32)ARG(scan) == is_par) {
3650 next = regnext(scan);
b515a41d 3651
8aa23a47
YO
3652 if ( next && (OP(next) != WHILEM) && next < last)
3653 is_par = 0; /* Disable optimization */
40d049e4 3654 }
8aa23a47
YO
3655 if (data)
3656 *(data->last_closep) = ARG(scan);
3657 }
3658 else if (OP(scan) == EVAL) {
c277df42
IZ
3659 if (data)
3660 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3661 }
3662 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3663 if (flags & SCF_DO_SUBSTR) {
3664 scan_commit(pRExC_state,data,minlenp);
3665 flags &= ~SCF_DO_SUBSTR;
40d049e4 3666 }
8aa23a47
YO
3667 if (data && OP(scan)==ACCEPT) {
3668 data->flags |= SCF_SEEN_ACCEPT;
3669 if (stopmin > min)
3670 stopmin = min;
e2e6a0f1 3671 }
8aa23a47
YO
3672 }
3673 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3674 {
0f5d15d6 3675 if (flags & SCF_DO_SUBSTR) {
1de06328 3676 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3677 data->longest = &(data->longest_float);
3678 }
3679 is_inf = is_inf_internal = 1;
653099ff 3680 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3681 cl_anything(pRExC_state, data->start_class);
96776eda 3682 flags &= ~SCF_DO_STCLASS;
8aa23a47 3683 }
58e23c8d 3684 else if (OP(scan) == GPOS) {
bbe252da 3685 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3686 !(delta || is_inf || (data && data->pos_delta)))
3687 {
bbe252da
YO
3688 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3689 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3690 if (RExC_rx->gofs < (U32)min)
3691 RExC_rx->gofs = min;
3692 } else {
bbe252da 3693 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3694 RExC_rx->gofs = 0;
3695 }
3696 }
786e8c11 3697#ifdef TRIE_STUDY_OPT
40d049e4 3698#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3699 else if (PL_regkind[OP(scan)] == TRIE) {
3700 /* NOTE - There is similar code to this block above for handling
3701 BRANCH nodes on the initial study. If you change stuff here
3702 check there too. */
3703 regnode *trie_node= scan;
3704 regnode *tail= regnext(scan);
f8fc2ecf 3705 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3706 I32 max1 = 0, min1 = I32_MAX;
3707 struct regnode_charclass_class accum;
3708
3709 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3710 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3711 if (flags & SCF_DO_STCLASS)
3712 cl_init_zero(pRExC_state, &accum);
3713
3714 if (!trie->jump) {
3715 min1= trie->minlen;
3716 max1= trie->maxlen;
3717 } else {
3718 const regnode *nextbranch= NULL;
3719 U32 word;
3720
3721 for ( word=1 ; word <= trie->wordcount ; word++)
3722 {
3723 I32 deltanext=0, minnext=0, f = 0, fake;
3724 struct regnode_charclass_class this_class;
3725
3726 data_fake.flags = 0;
3727 if (data) {
3728 data_fake.whilem_c = data->whilem_c;
3729 data_fake.last_closep = data->last_closep;
3730 }
3731 else
3732 data_fake.last_closep = &fake;
58e23c8d 3733 data_fake.pos_delta = delta;
8aa23a47
YO
3734 if (flags & SCF_DO_STCLASS) {
3735 cl_init(pRExC_state, &this_class);
3736 data_fake.start_class = &this_class;
3737 f = SCF_DO_STCLASS_AND;
3738 }
3739 if (flags & SCF_WHILEM_VISITED_POS)
3740 f |= SCF_WHILEM_VISITED_POS;
3741
3742 if (trie->jump[word]) {
3743 if (!nextbranch)
3744 nextbranch = trie_node + trie->jump[0];
3745 scan= trie_node + trie->jump[word];
3746 /* We go from the jump point to the branch that follows
3747 it. Note this means we need the vestigal unused branches
3748 even though they arent otherwise used.
3749 */
3750 minnext = study_chunk(pRExC_state, &scan, minlenp,
3751 &deltanext, (regnode *)nextbranch, &data_fake,
3752 stopparen, recursed, NULL, f,depth+1);
3753 }
3754 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3755 nextbranch= regnext((regnode*)nextbranch);
3756
3757 if (min1 > (I32)(minnext + trie->minlen))
3758 min1 = minnext + trie->minlen;
3759 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3760 max1 = minnext + deltanext + trie->maxlen;
3761 if (deltanext == I32_MAX)
3762 is_inf = is_inf_internal = 1;
3763
3764 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3765 pars++;
3766 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3767 if ( stopmin > min + min1)
3768 stopmin = min + min1;
3769 flags &= ~SCF_DO_SUBSTR;
3770 if (data)
3771 data->flags |= SCF_SEEN_ACCEPT;
3772 }
3773 if (data) {
3774 if (data_fake.flags & SF_HAS_EVAL)
3775 data->flags |= SF_HAS_EVAL;
3776 data->whilem_c = data_fake.whilem_c;
3777 }
3778 if (flags & SCF_DO_STCLASS)
3779 cl_or(pRExC_state, &accum, &this_class);
3780 }
3781 }
3782 if (flags & SCF_DO_SUBSTR) {
3783 data->pos_min += min1;
3784 data->pos_delta += max1 - min1;
3785 if (max1 != min1 || is_inf)
3786 data->longest = &(data->longest_float);
3787 }
3788 min += min1;
3789 delta += max1 - min1;
3790 if (flags & SCF_DO_STCLASS_OR) {
3791 cl_or(pRExC_state, data->start_class, &accum);
3792 if (min1) {
3793 cl_and(data->start_class, and_withp);
3794 flags &= ~SCF_DO_STCLASS;
3795 }
3796 }
3797 else if (flags & SCF_DO_STCLASS_AND) {
3798 if (min1) {
3799 cl_and(data->start_class, &accum);
3800 flags &= ~SCF_DO_STCLASS;
3801 }
3802 else {
3803 /* Switch to OR mode: cache the old value of
3804 * data->start_class */
3805 INIT_AND_WITHP;
3806 StructCopy(data->start_class, and_withp,
3807 struct regnode_charclass_class);
3808 flags &= ~SCF_DO_STCLASS_AND;
3809 StructCopy(&accum, data->start_class,
3810 struct regnode_charclass_class);
3811 flags |= SCF_DO_STCLASS_OR;
3812 data->start_class->flags |= ANYOF_EOS;
3813 }
3814 }
3815 scan= tail;
3816 continue;
3817 }
786e8c11 3818#else
8aa23a47 3819 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3820 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3821 U8*bang=NULL;
3822
3823 min += trie->minlen;
3824 delta += (trie->maxlen - trie->minlen);
3825 flags &= ~SCF_DO_STCLASS; /* xxx */
3826 if (flags & SCF_DO_SUBSTR) {
3827 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3828 data->pos_min += trie->minlen;
3829 data->pos_delta += (trie->maxlen - trie->minlen);
3830 if (trie->maxlen != trie->minlen)
3831 data->longest = &(data->longest_float);
3832 }
3833 if (trie->jump) /* no more substrings -- for now /grr*/
3834 flags &= ~SCF_DO_SUBSTR;
b515a41d 3835 }
8aa23a47
YO
3836#endif /* old or new */
3837#endif /* TRIE_STUDY_OPT */
3838 /* Else: zero-length, ignore. */
3839 scan = regnext(scan);
3840 }
3841 if (frame) {
3842 last = frame->last;
3843 scan = frame->next;
3844 stopparen = frame->stop;
3845 frame = frame->prev;
3846 goto fake_study_recurse;
c277df42
IZ