This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
C++: regcomp.c and ext/Time/Piece/Piece.xs
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
40d049e4
YO
129 regnode **recurse; /* Recurse regops */
130 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
131#if ADD_TO_REGEXEC
132 char *starttry; /* -Dr: where regtry was called. */
133#define RExC_starttry (pRExC_state->starttry)
134#endif
3dab1dad 135#ifdef DEBUGGING
be8e71aa 136 const char *lastparse;
3dab1dad
YO
137 I32 lastnum;
138#define RExC_lastparse (pRExC_state->lastparse)
139#define RExC_lastnum (pRExC_state->lastnum)
140#endif
830247a4
IZ
141} RExC_state_t;
142
e2509266 143#define RExC_flags (pRExC_state->flags)
830247a4
IZ
144#define RExC_precomp (pRExC_state->precomp)
145#define RExC_rx (pRExC_state->rx)
f8fc2ecf 146#define RExC_rxi (pRExC_state->rxi)
fac92740 147#define RExC_start (pRExC_state->start)
830247a4
IZ
148#define RExC_end (pRExC_state->end)
149#define RExC_parse (pRExC_state->parse)
150#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 151#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 152#define RExC_emit (pRExC_state->emit)
fac92740 153#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
154#define RExC_naughty (pRExC_state->naughty)
155#define RExC_sawback (pRExC_state->sawback)
156#define RExC_seen (pRExC_state->seen)
157#define RExC_size (pRExC_state->size)
158#define RExC_npar (pRExC_state->npar)
e2e6a0f1 159#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 164#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
165#define RExC_open_parens (pRExC_state->open_parens)
166#define RExC_close_parens (pRExC_state->close_parens)
167#define RExC_opend (pRExC_state->opend)
81714fb9 168#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
169#define RExC_recurse (pRExC_state->recurse)
170#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 171
a687059c
LW
172#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
173#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
174 ((*s) == '{' && regcurly(s)))
a687059c 175
35c8bce7
LW
176#ifdef SPSTART
177#undef SPSTART /* dratted cpp namespace... */
178#endif
a687059c
LW
179/*
180 * Flags to be passed up and down.
181 */
a687059c 182#define WORST 0 /* Worst case. */
821b33a5 183#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
184#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
185#define SPSTART 0x4 /* Starts with * or +. */
186#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 187
3dab1dad
YO
188#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
189
07be1b83
YO
190/* whether trie related optimizations are enabled */
191#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
192#define TRIE_STUDY_OPT
786e8c11 193#define FULL_TRIE_STUDY
07be1b83
YO
194#define TRIE_STCLASS
195#endif
1de06328
YO
196
197
40d049e4
YO
198
199#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
200#define PBITVAL(paren) (1 << ((paren) & 7))
201#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
202#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
203#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
204
205
1de06328
YO
206/* About scan_data_t.
207
208 During optimisation we recurse through the regexp program performing
209 various inplace (keyhole style) optimisations. In addition study_chunk
210 and scan_commit populate this data structure with information about
211 what strings MUST appear in the pattern. We look for the longest
212 string that must appear for at a fixed location, and we look for the
213 longest string that may appear at a floating location. So for instance
214 in the pattern:
215
216 /FOO[xX]A.*B[xX]BAR/
217
218 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
219 strings (because they follow a .* construct). study_chunk will identify
220 both FOO and BAR as being the longest fixed and floating strings respectively.
221
222 The strings can be composites, for instance
223
224 /(f)(o)(o)/
225
226 will result in a composite fixed substring 'foo'.
227
228 For each string some basic information is maintained:
229
230 - offset or min_offset
231 This is the position the string must appear at, or not before.
232 It also implicitly (when combined with minlenp) tells us how many
233 character must match before the string we are searching.
234 Likewise when combined with minlenp and the length of the string
235 tells us how many characters must appear after the string we have
236 found.
237
238 - max_offset
239 Only used for floating strings. This is the rightmost point that
240 the string can appear at. Ifset to I32 max it indicates that the
241 string can occur infinitely far to the right.
242
243 - minlenp
244 A pointer to the minimum length of the pattern that the string
245 was found inside. This is important as in the case of positive
246 lookahead or positive lookbehind we can have multiple patterns
247 involved. Consider
248
249 /(?=FOO).*F/
250
251 The minimum length of the pattern overall is 3, the minimum length
252 of the lookahead part is 3, but the minimum length of the part that
253 will actually match is 1. So 'FOO's minimum length is 3, but the
254 minimum length for the F is 1. This is important as the minimum length
255 is used to determine offsets in front of and behind the string being
256 looked for. Since strings can be composites this is the length of the
257 pattern at the time it was commited with a scan_commit. Note that
258 the length is calculated by study_chunk, so that the minimum lengths
259 are not known until the full pattern has been compiled, thus the
260 pointer to the value.
261
262 - lookbehind
263
264 In the case of lookbehind the string being searched for can be
265 offset past the start point of the final matching string.
266 If this value was just blithely removed from the min_offset it would
267 invalidate some of the calculations for how many chars must match
268 before or after (as they are derived from min_offset and minlen and
269 the length of the string being searched for).
270 When the final pattern is compiled and the data is moved from the
271 scan_data_t structure into the regexp structure the information
272 about lookbehind is factored in, with the information that would
273 have been lost precalculated in the end_shift field for the
274 associated string.
275
276 The fields pos_min and pos_delta are used to store the minimum offset
277 and the delta to the maximum offset at the current point in the pattern.
278
279*/
2c2d71f5
JH
280
281typedef struct scan_data_t {
1de06328
YO
282 /*I32 len_min; unused */
283 /*I32 len_delta; unused */
2c2d71f5
JH
284 I32 pos_min;
285 I32 pos_delta;
286 SV *last_found;
1de06328 287 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
288 I32 last_start_min;
289 I32 last_start_max;
1de06328
YO
290 SV **longest; /* Either &l_fixed, or &l_float. */
291 SV *longest_fixed; /* longest fixed string found in pattern */
292 I32 offset_fixed; /* offset where it starts */
293 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
294 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
295 SV *longest_float; /* longest floating string found in pattern */
296 I32 offset_float_min; /* earliest point in string it can appear */
297 I32 offset_float_max; /* latest point in string it can appear */
298 I32 *minlen_float; /* pointer to the minlen relevent to the string */
299 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
300 I32 flags;
301 I32 whilem_c;
cb434fcc 302 I32 *last_closep;
653099ff 303 struct regnode_charclass_class *start_class;
2c2d71f5
JH
304} scan_data_t;
305
a687059c 306/*
e50aee73 307 * Forward declarations for pregcomp()'s friends.
a687059c 308 */
a0d0e21e 309
27da23d5 310static const scan_data_t zero_scan_data =
1de06328 311 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
312
313#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
314#define SF_BEFORE_SEOL 0x0001
315#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
316#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
317#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
318
09b7f37c
CB
319#ifdef NO_UNARY_PLUS
320# define SF_FIX_SHIFT_EOL (0+2)
321# define SF_FL_SHIFT_EOL (0+4)
322#else
323# define SF_FIX_SHIFT_EOL (+2)
324# define SF_FL_SHIFT_EOL (+4)
325#endif
c277df42
IZ
326
327#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
328#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
329
330#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
331#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
332#define SF_IS_INF 0x0040
333#define SF_HAS_PAR 0x0080
334#define SF_IN_PAR 0x0100
335#define SF_HAS_EVAL 0x0200
336#define SCF_DO_SUBSTR 0x0400
653099ff
GS
337#define SCF_DO_STCLASS_AND 0x0800
338#define SCF_DO_STCLASS_OR 0x1000
339#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 340#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 341
786e8c11 342#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 343#define SCF_SEEN_ACCEPT 0x8000
07be1b83 344
eb160463 345#define UTF (RExC_utf8 != 0)
bbe252da
YO
346#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
347#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 348
ffc61ed2 349#define OOB_UNICODE 12345678
93733859 350#define OOB_NAMEDCLASS -1
b8c5462f 351
a0ed51b3
LW
352#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
353#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
354
8615cb43 355
b45f050a
JF
356/* length of regex to show in messages that don't mark a position within */
357#define RegexLengthToShowInErrorMessages 127
358
359/*
360 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
361 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
362 * op/pragma/warn/regcomp.
363 */
7253e4e3
RK
364#define MARKER1 "<-- HERE" /* marker as it appears in the description */
365#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 366
7253e4e3 367#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
368
369/*
370 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
371 * arg. Show regex, up to a maximum length. If it's too long, chop and add
372 * "...".
373 */
58e23c8d 374#define _FAIL(code) STMT_START { \
bfed75c6 375 const char *ellipses = ""; \
ccb2c380
MP
376 IV len = RExC_end - RExC_precomp; \
377 \
378 if (!SIZE_ONLY) \
379 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
380 if (len > RegexLengthToShowInErrorMessages) { \
381 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
382 len = RegexLengthToShowInErrorMessages - 10; \
383 ellipses = "..."; \
384 } \
58e23c8d 385 code; \
ccb2c380 386} STMT_END
8615cb43 387
58e23c8d
YO
388#define FAIL(msg) _FAIL( \
389 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
390 msg, (int)len, RExC_precomp, ellipses))
391
392#define FAIL2(msg,arg) _FAIL( \
393 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
394 arg, (int)len, RExC_precomp, ellipses))
395
b45f050a 396/*
b45f050a
JF
397 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
398 */
ccb2c380 399#define Simple_vFAIL(m) STMT_START { \
a28509cc 400 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
401 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
402 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
403} STMT_END
b45f050a
JF
404
405/*
406 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
407 */
ccb2c380
MP
408#define vFAIL(m) STMT_START { \
409 if (!SIZE_ONLY) \
410 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
411 Simple_vFAIL(m); \
412} STMT_END
b45f050a
JF
413
414/*
415 * Like Simple_vFAIL(), but accepts two arguments.
416 */
ccb2c380 417#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 418 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
419 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
420 (int)offset, RExC_precomp, RExC_precomp + offset); \
421} STMT_END
b45f050a
JF
422
423/*
424 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
425 */
ccb2c380
MP
426#define vFAIL2(m,a1) STMT_START { \
427 if (!SIZE_ONLY) \
428 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
429 Simple_vFAIL2(m, a1); \
430} STMT_END
b45f050a
JF
431
432
433/*
434 * Like Simple_vFAIL(), but accepts three arguments.
435 */
ccb2c380 436#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 437 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
438 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
439 (int)offset, RExC_precomp, RExC_precomp + offset); \
440} STMT_END
b45f050a
JF
441
442/*
443 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
444 */
ccb2c380
MP
445#define vFAIL3(m,a1,a2) STMT_START { \
446 if (!SIZE_ONLY) \
447 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
448 Simple_vFAIL3(m, a1, a2); \
449} STMT_END
b45f050a
JF
450
451/*
452 * Like Simple_vFAIL(), but accepts four arguments.
453 */
ccb2c380 454#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 455 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
458} STMT_END
b45f050a 459
ccb2c380 460#define vWARN(loc,m) STMT_START { \
a28509cc 461 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
462 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
463 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
465
466#define vWARNdep(loc,m) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
469 "%s" REPORT_LOCATION, \
470 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
471} STMT_END
472
473
474#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 475 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
476 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
477 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
479
480#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
485
486#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 487 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
488 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
489 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
490} STMT_END
491
492#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 493 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
494 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
495 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
496} STMT_END
9d1d55b5 497
8615cb43 498
cd439c50 499/* Allow for side effects in s */
ccb2c380
MP
500#define REGC(c,s) STMT_START { \
501 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
502} STMT_END
cd439c50 503
fac92740
MJD
504/* Macros for recording node offsets. 20001227 mjd@plover.com
505 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
506 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
507 * Element 0 holds the number n.
07be1b83 508 * Position is 1 indexed.
fac92740
MJD
509 */
510
ccb2c380
MP
511#define Set_Node_Offset_To_R(node,byte) STMT_START { \
512 if (! SIZE_ONLY) { \
513 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 514 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 515 if((node) < 0) { \
551405c4 516 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
517 } else { \
518 RExC_offsets[2*(node)-1] = (byte); \
519 } \
520 } \
521} STMT_END
522
523#define Set_Node_Offset(node,byte) \
524 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
525#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
526
527#define Set_Node_Length_To_R(node,len) STMT_START { \
528 if (! SIZE_ONLY) { \
529 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 530 __LINE__, (int)(node), (int)(len))); \
ccb2c380 531 if((node) < 0) { \
551405c4 532 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
533 } else { \
534 RExC_offsets[2*(node)] = (len); \
535 } \
536 } \
537} STMT_END
538
539#define Set_Node_Length(node,len) \
540 Set_Node_Length_To_R((node)-RExC_emit_start, len)
541#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
542#define Set_Node_Cur_Length(node) \
543 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
544
545/* Get offsets and lengths */
546#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
547#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
548
07be1b83
YO
549#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
550 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
551 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
552} STMT_END
553
554
555#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
556#define EXPERIMENTAL_INPLACESCAN
557#endif
558
304ee84b
YO
559#define DEBUG_STUDYDATA(str,data,depth) \
560DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 561 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
562 "%*s" str "Pos:%"IVdf"/%"IVdf \
563 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
564 (int)(depth)*2, "", \
565 (IV)((data)->pos_min), \
566 (IV)((data)->pos_delta), \
304ee84b 567 (UV)((data)->flags), \
1de06328 568 (IV)((data)->whilem_c), \
304ee84b
YO
569 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
570 is_inf ? "INF " : "" \
1de06328
YO
571 ); \
572 if ((data)->last_found) \
573 PerlIO_printf(Perl_debug_log, \
574 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
575 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
576 SvPVX_const((data)->last_found), \
577 (IV)((data)->last_end), \
578 (IV)((data)->last_start_min), \
579 (IV)((data)->last_start_max), \
580 ((data)->longest && \
581 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
582 SvPVX_const((data)->longest_fixed), \
583 (IV)((data)->offset_fixed), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_float)) ? "*" : "", \
586 SvPVX_const((data)->longest_float), \
587 (IV)((data)->offset_float_min), \
588 (IV)((data)->offset_float_max) \
589 ); \
590 PerlIO_printf(Perl_debug_log,"\n"); \
591});
592
acfe0abc 593static void clear_re(pTHX_ void *r);
4327152a 594
653099ff 595/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 596 Update the longest found anchored substring and the longest found
653099ff
GS
597 floating substrings if needed. */
598
4327152a 599STATIC void
304ee84b 600S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 601{
e1ec3a88
AL
602 const STRLEN l = CHR_SVLEN(data->last_found);
603 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 604 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 605
c277df42 606 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 607 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
608 if (*data->longest == data->longest_fixed) {
609 data->offset_fixed = l ? data->last_start_min : data->pos_min;
610 if (data->flags & SF_BEFORE_EOL)
b81d288d 611 data->flags
c277df42
IZ
612 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
613 else
614 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
615 data->minlen_fixed=minlenp;
616 data->lookbehind_fixed=0;
a0ed51b3 617 }
304ee84b 618 else { /* *data->longest == data->longest_float */
c277df42 619 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
620 data->offset_float_max = (l
621 ? data->last_start_max
c277df42 622 : data->pos_min + data->pos_delta);
304ee84b 623 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 624 data->offset_float_max = I32_MAX;
c277df42 625 if (data->flags & SF_BEFORE_EOL)
b81d288d 626 data->flags
c277df42
IZ
627 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
628 else
629 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
630 data->minlen_float=minlenp;
631 data->lookbehind_float=0;
c277df42
IZ
632 }
633 }
634 SvCUR_set(data->last_found, 0);
0eda9292 635 {
a28509cc 636 SV * const sv = data->last_found;
097eb12c
AL
637 if (SvUTF8(sv) && SvMAGICAL(sv)) {
638 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
639 if (mg)
640 mg->mg_len = 0;
641 }
0eda9292 642 }
c277df42
IZ
643 data->last_end = -1;
644 data->flags &= ~SF_BEFORE_EOL;
304ee84b 645 DEBUG_STUDYDATA("cl_anything: ",data,0);
c277df42
IZ
646}
647
653099ff
GS
648/* Can match anything (initialization) */
649STATIC void
097eb12c 650S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
653099ff 652 ANYOF_CLASS_ZERO(cl);
f8bef550 653 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 654 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* Can match anything (initialization) */
660STATIC int
5f66b61c 661S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
662{
663 int value;
664
aaa51d5e 665 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
666 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
667 return 1;
1aa99e6b
IH
668 if (!(cl->flags & ANYOF_UNICODE_ALL))
669 return 0;
10edeb5d 670 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 671 return 0;
653099ff
GS
672 return 1;
673}
674
675/* Can match anything (initialization) */
676STATIC void
097eb12c 677S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 678{
8ecf7187 679 Zero(cl, 1, struct regnode_charclass_class);
653099ff 680 cl->type = ANYOF;
830247a4 681 cl_anything(pRExC_state, cl);
653099ff
GS
682}
683
684STATIC void
097eb12c 685S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 686{
8ecf7187 687 Zero(cl, 1, struct regnode_charclass_class);
653099ff 688 cl->type = ANYOF;
830247a4 689 cl_anything(pRExC_state, cl);
653099ff
GS
690 if (LOC)
691 cl->flags |= ANYOF_LOCALE;
692}
693
694/* 'And' a given class with another one. Can create false positives */
695/* We assume that cl is not inverted */
696STATIC void
5f66b61c 697S_cl_and(struct regnode_charclass_class *cl,
a28509cc 698 const struct regnode_charclass_class *and_with)
653099ff 699{
40d049e4
YO
700
701 assert(and_with->type == ANYOF);
653099ff
GS
702 if (!(and_with->flags & ANYOF_CLASS)
703 && !(cl->flags & ANYOF_CLASS)
704 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
705 && !(and_with->flags & ANYOF_FOLD)
706 && !(cl->flags & ANYOF_FOLD)) {
707 int i;
708
709 if (and_with->flags & ANYOF_INVERT)
710 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
711 cl->bitmap[i] &= ~and_with->bitmap[i];
712 else
713 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
714 cl->bitmap[i] &= and_with->bitmap[i];
715 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
716 if (!(and_with->flags & ANYOF_EOS))
717 cl->flags &= ~ANYOF_EOS;
1aa99e6b 718
14ebb1a2
JH
719 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
720 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
721 cl->flags &= ~ANYOF_UNICODE_ALL;
722 cl->flags |= ANYOF_UNICODE;
723 ARG_SET(cl, ARG(and_with));
724 }
14ebb1a2
JH
725 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
726 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 727 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
728 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
729 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 730 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
731}
732
733/* 'OR' a given class with another one. Can create false positives */
734/* We assume that cl is not inverted */
735STATIC void
097eb12c 736S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 737{
653099ff
GS
738 if (or_with->flags & ANYOF_INVERT) {
739 /* We do not use
740 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
741 * <= (B1 | !B2) | (CL1 | !CL2)
742 * which is wasteful if CL2 is small, but we ignore CL2:
743 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
744 * XXXX Can we handle case-fold? Unclear:
745 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
746 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
747 */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
749 && !(or_with->flags & ANYOF_FOLD)
750 && !(cl->flags & ANYOF_FOLD) ) {
751 int i;
752
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] |= ~or_with->bitmap[i];
755 } /* XXXX: logic is complicated otherwise */
756 else {
830247a4 757 cl_anything(pRExC_state, cl);
653099ff
GS
758 }
759 } else {
760 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
761 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 762 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
763 || (cl->flags & ANYOF_FOLD)) ) {
764 int i;
765
766 /* OR char bitmap and class bitmap separately */
767 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
768 cl->bitmap[i] |= or_with->bitmap[i];
769 if (or_with->flags & ANYOF_CLASS) {
770 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
771 cl->classflags[i] |= or_with->classflags[i];
772 cl->flags |= ANYOF_CLASS;
773 }
774 }
775 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 776 cl_anything(pRExC_state, cl);
653099ff
GS
777 }
778 }
779 if (or_with->flags & ANYOF_EOS)
780 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
781
782 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
783 ARG(cl) != ARG(or_with)) {
784 cl->flags |= ANYOF_UNICODE_ALL;
785 cl->flags &= ~ANYOF_UNICODE;
786 }
787 if (or_with->flags & ANYOF_UNICODE_ALL) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
653099ff
GS
791}
792
a3621e74
YO
793#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
794#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
795#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
796#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
797
3dab1dad
YO
798
799#ifdef DEBUGGING
07be1b83 800/*
2b8b4781
NC
801 dump_trie(trie,widecharmap,revcharmap)
802 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
803 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
804
805 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
806 The _interim_ variants are used for debugging the interim
807 tables that are used to generate the final compressed
808 representation which is what dump_trie expects.
809
3dab1dad
YO
810 Part of the reason for their existance is to provide a form
811 of documentation as to how the different representations function.
07be1b83
YO
812
813*/
3dab1dad
YO
814
815/*
3dab1dad
YO
816 Dumps the final compressed table form of the trie to Perl_debug_log.
817 Used for debugging make_trie().
818*/
819
820STATIC void
2b8b4781
NC
821S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
822 AV *revcharmap, U32 depth)
3dab1dad
YO
823{
824 U32 state;
ab3bbdeb 825 SV *sv=sv_newmortal();
55eed653 826 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
827 GET_RE_DEBUG_FLAGS_DECL;
828
ab3bbdeb 829
3dab1dad
YO
830 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
831 (int)depth * 2 + 2,"",
832 "Match","Base","Ofs" );
833
834 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 835 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 836 if ( tmp ) {
ab3bbdeb
YO
837 PerlIO_printf( Perl_debug_log, "%*s",
838 colwidth,
ddc5bc0f 839 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
840 PL_colors[0], PL_colors[1],
841 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
842 PERL_PV_ESCAPE_FIRSTCHAR
843 )
844 );
3dab1dad
YO
845 }
846 }
847 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
848 (int)depth * 2 + 2,"");
849
850 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
852 PerlIO_printf( Perl_debug_log, "\n");
853
1e2e3d02 854 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 855 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
856
857 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
858
859 if ( trie->states[ state ].wordnum ) {
860 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
861 } else {
862 PerlIO_printf( Perl_debug_log, "%6s", "" );
863 }
864
865 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
866
867 if ( base ) {
868 U32 ofs = 0;
869
870 while( ( base + ofs < trie->uniquecharcount ) ||
871 ( base + ofs - trie->uniquecharcount < trie->lasttrans
872 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
873 ofs++;
874
875 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
876
877 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
878 if ( ( base + ofs >= trie->uniquecharcount ) &&
879 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
880 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
881 {
ab3bbdeb
YO
882 PerlIO_printf( Perl_debug_log, "%*"UVXf,
883 colwidth,
3dab1dad
YO
884 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
885 } else {
ab3bbdeb 886 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
887 }
888 }
889
890 PerlIO_printf( Perl_debug_log, "]");
891
892 }
893 PerlIO_printf( Perl_debug_log, "\n" );
894 }
895}
896/*
3dab1dad
YO
897 Dumps a fully constructed but uncompressed trie in list form.
898 List tries normally only are used for construction when the number of
899 possible chars (trie->uniquecharcount) is very high.
900 Used for debugging make_trie().
901*/
902STATIC void
55eed653 903S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
904 HV *widecharmap, AV *revcharmap, U32 next_alloc,
905 U32 depth)
3dab1dad
YO
906{
907 U32 state;
ab3bbdeb 908 SV *sv=sv_newmortal();
55eed653 909 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
910 GET_RE_DEBUG_FLAGS_DECL;
911 /* print out the table precompression. */
ab3bbdeb
YO
912 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
913 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
914 "------:-----+-----------------\n" );
3dab1dad
YO
915
916 for( state=1 ; state < next_alloc ; state ++ ) {
917 U16 charid;
918
ab3bbdeb 919 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
920 (int)depth * 2 + 2,"", (UV)state );
921 if ( ! trie->states[ state ].wordnum ) {
922 PerlIO_printf( Perl_debug_log, "%5s| ","");
923 } else {
924 PerlIO_printf( Perl_debug_log, "W%4x| ",
925 trie->states[ state ].wordnum
926 );
927 }
928 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 929 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
930 if ( tmp ) {
931 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
932 colwidth,
ddc5bc0f 933 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
934 PL_colors[0], PL_colors[1],
935 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
936 PERL_PV_ESCAPE_FIRSTCHAR
937 ) ,
1e2e3d02
YO
938 TRIE_LIST_ITEM(state,charid).forid,
939 (UV)TRIE_LIST_ITEM(state,charid).newstate
940 );
941 if (!(charid % 10))
664e119d
RGS
942 PerlIO_printf(Perl_debug_log, "\n%*s| ",
943 (int)((depth * 2) + 14), "");
1e2e3d02 944 }
ab3bbdeb
YO
945 }
946 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
947 }
948}
949
950/*
3dab1dad
YO
951 Dumps a fully constructed but uncompressed trie in table form.
952 This is the normal DFA style state transition table, with a few
953 twists to facilitate compression later.
954 Used for debugging make_trie().
955*/
956STATIC void
55eed653 957S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
958 HV *widecharmap, AV *revcharmap, U32 next_alloc,
959 U32 depth)
3dab1dad
YO
960{
961 U32 state;
962 U16 charid;
ab3bbdeb 963 SV *sv=sv_newmortal();
55eed653 964 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
965 GET_RE_DEBUG_FLAGS_DECL;
966
967 /*
968 print out the table precompression so that we can do a visual check
969 that they are identical.
970 */
971
972 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
973
974 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 975 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 976 if ( tmp ) {
ab3bbdeb
YO
977 PerlIO_printf( Perl_debug_log, "%*s",
978 colwidth,
ddc5bc0f 979 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
980 PL_colors[0], PL_colors[1],
981 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
982 PERL_PV_ESCAPE_FIRSTCHAR
983 )
984 );
3dab1dad
YO
985 }
986 }
987
988 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
989
990 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 991 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
992 }
993
994 PerlIO_printf( Perl_debug_log, "\n" );
995
996 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
997
998 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
999 (int)depth * 2 + 2,"",
1000 (UV)TRIE_NODENUM( state ) );
1001
1002 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1003 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1004 if (v)
1005 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1006 else
1007 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1008 }
1009 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1010 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1011 } else {
1012 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1013 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1014 }
1015 }
07be1b83 1016}
3dab1dad
YO
1017
1018#endif
1019
786e8c11
YO
1020/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1021 startbranch: the first branch in the whole branch sequence
1022 first : start branch of sequence of branch-exact nodes.
1023 May be the same as startbranch
1024 last : Thing following the last branch.
1025 May be the same as tail.
1026 tail : item following the branch sequence
1027 count : words in the sequence
1028 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1029 depth : indent depth
3dab1dad 1030
786e8c11 1031Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1032
786e8c11
YO
1033A trie is an N'ary tree where the branches are determined by digital
1034decomposition of the key. IE, at the root node you look up the 1st character and
1035follow that branch repeat until you find the end of the branches. Nodes can be
1036marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1037
786e8c11 1038 /he|she|his|hers/
72f13be8 1039
786e8c11
YO
1040would convert into the following structure. Numbers represent states, letters
1041following numbers represent valid transitions on the letter from that state, if
1042the number is in square brackets it represents an accepting state, otherwise it
1043will be in parenthesis.
07be1b83 1044
786e8c11
YO
1045 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1046 | |
1047 | (2)
1048 | |
1049 (1) +-i->(6)-+-s->[7]
1050 |
1051 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1052
786e8c11
YO
1053 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1054
1055This shows that when matching against the string 'hers' we will begin at state 1
1056read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1057then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1058is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1059single traverse. We store a mapping from accepting to state to which word was
1060matched, and then when we have multiple possibilities we try to complete the
1061rest of the regex in the order in which they occured in the alternation.
1062
1063The only prior NFA like behaviour that would be changed by the TRIE support is
1064the silent ignoring of duplicate alternations which are of the form:
1065
1066 / (DUPE|DUPE) X? (?{ ... }) Y /x
1067
1068Thus EVAL blocks follwing a trie may be called a different number of times with
1069and without the optimisation. With the optimisations dupes will be silently
1070ignored. This inconsistant behaviour of EVAL type nodes is well established as
1071the following demonstrates:
1072
1073 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1074
1075which prints out 'word' three times, but
1076
1077 'words'=~/(word|word|word)(?{ print $1 })S/
1078
1079which doesnt print it out at all. This is due to other optimisations kicking in.
1080
1081Example of what happens on a structural level:
1082
1083The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1084
1085 1: CURLYM[1] {1,32767}(18)
1086 5: BRANCH(8)
1087 6: EXACT <ac>(16)
1088 8: BRANCH(11)
1089 9: EXACT <ad>(16)
1090 11: BRANCH(14)
1091 12: EXACT <ab>(16)
1092 16: SUCCEED(0)
1093 17: NOTHING(18)
1094 18: END(0)
1095
1096This would be optimizable with startbranch=5, first=5, last=16, tail=16
1097and should turn into:
1098
1099 1: CURLYM[1] {1,32767}(18)
1100 5: TRIE(16)
1101 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1102 <ac>
1103 <ad>
1104 <ab>
1105 16: SUCCEED(0)
1106 17: NOTHING(18)
1107 18: END(0)
1108
1109Cases where tail != last would be like /(?foo|bar)baz/:
1110
1111 1: BRANCH(4)
1112 2: EXACT <foo>(8)
1113 4: BRANCH(7)
1114 5: EXACT <bar>(8)
1115 7: TAIL(8)
1116 8: EXACT <baz>(10)
1117 10: END(0)
1118
1119which would be optimizable with startbranch=1, first=1, last=7, tail=8
1120and would end up looking like:
1121
1122 1: TRIE(8)
1123 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1124 <foo>
1125 <bar>
1126 7: TAIL(8)
1127 8: EXACT <baz>(10)
1128 10: END(0)
1129
1130 d = uvuni_to_utf8_flags(d, uv, 0);
1131
1132is the recommended Unicode-aware way of saying
1133
1134 *(d++) = uv;
1135*/
1136
1e2e3d02 1137#define TRIE_STORE_REVCHAR \
786e8c11 1138 STMT_START { \
1e2e3d02 1139 SV *tmp = newSVpvs(""); \
786e8c11 1140 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1141 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1142 av_push( revcharmap, tmp ); \
786e8c11
YO
1143 } STMT_END
1144
1145#define TRIE_READ_CHAR STMT_START { \
1146 wordlen++; \
1147 if ( UTF ) { \
1148 if ( folder ) { \
1149 if ( foldlen > 0 ) { \
1150 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1151 foldlen -= len; \
1152 scan += len; \
1153 len = 0; \
1154 } else { \
1155 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1156 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1157 foldlen -= UNISKIP( uvc ); \
1158 scan = foldbuf + UNISKIP( uvc ); \
1159 } \
1160 } else { \
1161 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1162 } \
1163 } else { \
1164 uvc = (U32)*uc; \
1165 len = 1; \
1166 } \
1167} STMT_END
1168
1169
1170
1171#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1172 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1173 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1174 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1175 } \
1176 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1177 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1178 TRIE_LIST_CUR( state )++; \
1179} STMT_END
07be1b83 1180
786e8c11
YO
1181#define TRIE_LIST_NEW(state) STMT_START { \
1182 Newxz( trie->states[ state ].trans.list, \
1183 4, reg_trie_trans_le ); \
1184 TRIE_LIST_CUR( state ) = 1; \
1185 TRIE_LIST_LEN( state ) = 4; \
1186} STMT_END
07be1b83 1187
786e8c11
YO
1188#define TRIE_HANDLE_WORD(state) STMT_START { \
1189 U16 dupe= trie->states[ state ].wordnum; \
1190 regnode * const noper_next = regnext( noper ); \
1191 \
1192 if (trie->wordlen) \
1193 trie->wordlen[ curword ] = wordlen; \
1194 DEBUG_r({ \
1195 /* store the word for dumping */ \
1196 SV* tmp; \
1197 if (OP(noper) != NOTHING) \
1198 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1199 else \
1200 tmp = newSVpvn( "", 0 ); \
1201 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1202 av_push( trie_words, tmp ); \
786e8c11
YO
1203 }); \
1204 \
1205 curword++; \
1206 \
1207 if ( noper_next < tail ) { \
1208 if (!trie->jump) \
c944940b 1209 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1210 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1211 if (!jumper) \
1212 jumper = noper_next; \
1213 if (!nextbranch) \
1214 nextbranch= regnext(cur); \
1215 } \
1216 \
1217 if ( dupe ) { \
1218 /* So it's a dupe. This means we need to maintain a */\
1219 /* linked-list from the first to the next. */\
1220 /* we only allocate the nextword buffer when there */\
1221 /* a dupe, so first time we have to do the allocation */\
1222 if (!trie->nextword) \
c944940b 1223 trie->nextword = (U16 *) \
446bd890 1224 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1225 while ( trie->nextword[dupe] ) \
1226 dupe= trie->nextword[dupe]; \
1227 trie->nextword[dupe]= curword; \
1228 } else { \
1229 /* we haven't inserted this word yet. */ \
1230 trie->states[ state ].wordnum = curword; \
1231 } \
1232} STMT_END
07be1b83 1233
3dab1dad 1234
786e8c11
YO
1235#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1236 ( ( base + charid >= ucharcount \
1237 && base + charid < ubound \
1238 && state == trie->trans[ base - ucharcount + charid ].check \
1239 && trie->trans[ base - ucharcount + charid ].next ) \
1240 ? trie->trans[ base - ucharcount + charid ].next \
1241 : ( state==1 ? special : 0 ) \
1242 )
3dab1dad 1243
786e8c11
YO
1244#define MADE_TRIE 1
1245#define MADE_JUMP_TRIE 2
1246#define MADE_EXACT_TRIE 4
3dab1dad 1247
a3621e74 1248STATIC I32
786e8c11 1249S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1250{
27da23d5 1251 dVAR;
a3621e74
YO
1252 /* first pass, loop through and scan words */
1253 reg_trie_data *trie;
55eed653 1254 HV *widecharmap = NULL;
2b8b4781 1255 AV *revcharmap = newAV();
a3621e74 1256 regnode *cur;
9f7f3913 1257 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1258 STRLEN len = 0;
1259 UV uvc = 0;
1260 U16 curword = 0;
1261 U32 next_alloc = 0;
786e8c11
YO
1262 regnode *jumper = NULL;
1263 regnode *nextbranch = NULL;
7f69552c 1264 regnode *convert = NULL;
a3621e74 1265 /* we just use folder as a flag in utf8 */
e1ec3a88 1266 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1267 ? PL_fold
1268 : ( flags == EXACTFL
1269 ? PL_fold_locale
1270 : NULL
1271 )
1272 );
1273
2b8b4781
NC
1274#ifdef DEBUGGING
1275 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1276 AV *trie_words = NULL;
1277 /* along with revcharmap, this only used during construction but both are
1278 * useful during debugging so we store them in the struct when debugging.
8e11feef 1279 */
2b8b4781
NC
1280#else
1281 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1282 STRLEN trie_charcount=0;
3dab1dad 1283#endif
2b8b4781 1284 SV *re_trie_maxbuff;
a3621e74 1285 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1286#ifndef DEBUGGING
1287 PERL_UNUSED_ARG(depth);
1288#endif
a3621e74 1289
c944940b 1290 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1291 trie->refcount = 1;
3dab1dad 1292 trie->startstate = 1;
786e8c11 1293 trie->wordcount = word_count;
f8fc2ecf 1294 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1295 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1296 if (!(UTF && folder))
c944940b 1297 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1298 DEBUG_r({
2b8b4781 1299 trie_words = newAV();
a3621e74 1300 });
a3621e74 1301
0111c4fd 1302 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1303 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1304 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1305 }
3dab1dad
YO
1306 DEBUG_OPTIMISE_r({
1307 PerlIO_printf( Perl_debug_log,
786e8c11 1308 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1309 (int)depth * 2 + 2, "",
1310 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1311 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1312 (int)depth);
3dab1dad 1313 });
7f69552c
YO
1314
1315 /* Find the node we are going to overwrite */
1316 if ( first == startbranch && OP( last ) != BRANCH ) {
1317 /* whole branch chain */
1318 convert = first;
1319 } else {
1320 /* branch sub-chain */
1321 convert = NEXTOPER( first );
1322 }
1323
a3621e74
YO
1324 /* -- First loop and Setup --
1325
1326 We first traverse the branches and scan each word to determine if it
1327 contains widechars, and how many unique chars there are, this is
1328 important as we have to build a table with at least as many columns as we
1329 have unique chars.
1330
1331 We use an array of integers to represent the character codes 0..255
1332 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1333 native representation of the character value as the key and IV's for the
1334 coded index.
1335
1336 *TODO* If we keep track of how many times each character is used we can
1337 remap the columns so that the table compression later on is more
1338 efficient in terms of memory by ensuring most common value is in the
1339 middle and the least common are on the outside. IMO this would be better
1340 than a most to least common mapping as theres a decent chance the most
1341 common letter will share a node with the least common, meaning the node
1342 will not be compressable. With a middle is most common approach the worst
1343 case is when we have the least common nodes twice.
1344
1345 */
1346
a3621e74 1347 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1348 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1349 const U8 *uc = (U8*)STRING( noper );
a28509cc 1350 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1351 STRLEN foldlen = 0;
1352 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1353 const U8 *scan = (U8*)NULL;
07be1b83 1354 U32 wordlen = 0; /* required init */
3dab1dad 1355 STRLEN chars=0;
a3621e74 1356
3dab1dad
YO
1357 if (OP(noper) == NOTHING) {
1358 trie->minlen= 0;
1359 continue;
1360 }
1361 if (trie->bitmap) {
1362 TRIE_BITMAP_SET(trie,*uc);
1363 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1364 }
a3621e74 1365 for ( ; uc < e ; uc += len ) {
3dab1dad 1366 TRIE_CHARCOUNT(trie)++;
a3621e74 1367 TRIE_READ_CHAR;
3dab1dad 1368 chars++;
a3621e74
YO
1369 if ( uvc < 256 ) {
1370 if ( !trie->charmap[ uvc ] ) {
1371 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1372 if ( folder )
1373 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1374 TRIE_STORE_REVCHAR;
a3621e74
YO
1375 }
1376 } else {
1377 SV** svpp;
55eed653
NC
1378 if ( !widecharmap )
1379 widecharmap = newHV();
a3621e74 1380
55eed653 1381 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1382
1383 if ( !svpp )
e4584336 1384 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1385
1386 if ( !SvTRUE( *svpp ) ) {
1387 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1388 TRIE_STORE_REVCHAR;
a3621e74
YO
1389 }
1390 }
1391 }
3dab1dad
YO
1392 if( cur == first ) {
1393 trie->minlen=chars;
1394 trie->maxlen=chars;
1395 } else if (chars < trie->minlen) {
1396 trie->minlen=chars;
1397 } else if (chars > trie->maxlen) {
1398 trie->maxlen=chars;
1399 }
1400
a3621e74
YO
1401 } /* end first pass */
1402 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1403 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1404 (int)depth * 2 + 2,"",
55eed653 1405 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1406 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1407 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1408 );
c944940b 1409 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1410
1411 /*
1412 We now know what we are dealing with in terms of unique chars and
1413 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1414 representation using a flat table will take. If it's over a reasonable
1415 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1416 conservative but potentially much slower representation using an array
1417 of lists.
1418
1419 At the end we convert both representations into the same compressed
1420 form that will be used in regexec.c for matching with. The latter
1421 is a form that cannot be used to construct with but has memory
1422 properties similar to the list form and access properties similar
1423 to the table form making it both suitable for fast searches and
1424 small enough that its feasable to store for the duration of a program.
1425
1426 See the comment in the code where the compressed table is produced
1427 inplace from the flat tabe representation for an explanation of how
1428 the compression works.
1429
1430 */
1431
1432
3dab1dad 1433 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1434 /*
1435 Second Pass -- Array Of Lists Representation
1436
1437 Each state will be represented by a list of charid:state records
1438 (reg_trie_trans_le) the first such element holds the CUR and LEN
1439 points of the allocated array. (See defines above).
1440
1441 We build the initial structure using the lists, and then convert
1442 it into the compressed table form which allows faster lookups
1443 (but cant be modified once converted).
a3621e74
YO
1444 */
1445
a3621e74
YO
1446 STRLEN transcount = 1;
1447
1e2e3d02
YO
1448 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1449 "%*sCompiling trie using list compiler\n",
1450 (int)depth * 2 + 2, ""));
446bd890 1451
c944940b
JH
1452 trie->states = (reg_trie_state *)
1453 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1454 sizeof(reg_trie_state) );
a3621e74
YO
1455 TRIE_LIST_NEW(1);
1456 next_alloc = 2;
1457
1458 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1459
c445ea15
AL
1460 regnode * const noper = NEXTOPER( cur );
1461 U8 *uc = (U8*)STRING( noper );
1462 const U8 * const e = uc + STR_LEN( noper );
1463 U32 state = 1; /* required init */
1464 U16 charid = 0; /* sanity init */
1465 U8 *scan = (U8*)NULL; /* sanity init */
1466 STRLEN foldlen = 0; /* required init */
07be1b83 1467 U32 wordlen = 0; /* required init */
c445ea15
AL
1468 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1469
3dab1dad 1470 if (OP(noper) != NOTHING) {
786e8c11 1471 for ( ; uc < e ; uc += len ) {
c445ea15 1472
786e8c11 1473 TRIE_READ_CHAR;
c445ea15 1474
786e8c11
YO
1475 if ( uvc < 256 ) {
1476 charid = trie->charmap[ uvc ];
c445ea15 1477 } else {
55eed653 1478 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1479 if ( !svpp ) {
1480 charid = 0;
1481 } else {
1482 charid=(U16)SvIV( *svpp );
1483 }
c445ea15 1484 }
786e8c11
YO
1485 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1486 if ( charid ) {
a3621e74 1487
786e8c11
YO
1488 U16 check;
1489 U32 newstate = 0;
a3621e74 1490
786e8c11
YO
1491 charid--;
1492 if ( !trie->states[ state ].trans.list ) {
1493 TRIE_LIST_NEW( state );
c445ea15 1494 }
786e8c11
YO
1495 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1496 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1497 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1498 break;
1499 }
1500 }
1501 if ( ! newstate ) {
1502 newstate = next_alloc++;
1503 TRIE_LIST_PUSH( state, charid, newstate );
1504 transcount++;
1505 }
1506 state = newstate;
1507 } else {
1508 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1509 }
a28509cc 1510 }
c445ea15 1511 }
3dab1dad 1512 TRIE_HANDLE_WORD(state);
a3621e74
YO
1513
1514 } /* end second pass */
1515
1e2e3d02
YO
1516 /* next alloc is the NEXT state to be allocated */
1517 trie->statecount = next_alloc;
c944940b
JH
1518 trie->states = (reg_trie_state *)
1519 PerlMemShared_realloc( trie->states,
1520 next_alloc
1521 * sizeof(reg_trie_state) );
a3621e74 1522
3dab1dad 1523 /* and now dump it out before we compress it */
2b8b4781
NC
1524 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1525 revcharmap, next_alloc,
1526 depth+1)
1e2e3d02 1527 );
a3621e74 1528
c944940b
JH
1529 trie->trans = (reg_trie_trans *)
1530 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1531 {
1532 U32 state;
a3621e74
YO
1533 U32 tp = 0;
1534 U32 zp = 0;
1535
1536
1537 for( state=1 ; state < next_alloc ; state ++ ) {
1538 U32 base=0;
1539
1540 /*
1541 DEBUG_TRIE_COMPILE_MORE_r(
1542 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1543 );
1544 */
1545
1546 if (trie->states[state].trans.list) {
1547 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1548 U16 maxid=minid;
a28509cc 1549 U16 idx;
a3621e74
YO
1550
1551 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1552 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1553 if ( forid < minid ) {
1554 minid=forid;
1555 } else if ( forid > maxid ) {
1556 maxid=forid;
1557 }
a3621e74
YO
1558 }
1559 if ( transcount < tp + maxid - minid + 1) {
1560 transcount *= 2;
c944940b
JH
1561 trie->trans = (reg_trie_trans *)
1562 PerlMemShared_realloc( trie->trans,
446bd890
NC
1563 transcount
1564 * sizeof(reg_trie_trans) );
a3621e74
YO
1565 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1566 }
1567 base = trie->uniquecharcount + tp - minid;
1568 if ( maxid == minid ) {
1569 U32 set = 0;
1570 for ( ; zp < tp ; zp++ ) {
1571 if ( ! trie->trans[ zp ].next ) {
1572 base = trie->uniquecharcount + zp - minid;
1573 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1574 trie->trans[ zp ].check = state;
1575 set = 1;
1576 break;
1577 }
1578 }
1579 if ( !set ) {
1580 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1581 trie->trans[ tp ].check = state;
1582 tp++;
1583 zp = tp;
1584 }
1585 } else {
1586 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1587 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1588 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1589 trie->trans[ tid ].check = state;
1590 }
1591 tp += ( maxid - minid + 1 );
1592 }
1593 Safefree(trie->states[ state ].trans.list);
1594 }
1595 /*
1596 DEBUG_TRIE_COMPILE_MORE_r(
1597 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1598 );
1599 */
1600 trie->states[ state ].trans.base=base;
1601 }
cc601c31 1602 trie->lasttrans = tp + 1;
a3621e74
YO
1603 }
1604 } else {
1605 /*
1606 Second Pass -- Flat Table Representation.
1607
1608 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1609 We know that we will need Charcount+1 trans at most to store the data
1610 (one row per char at worst case) So we preallocate both structures
1611 assuming worst case.
1612
1613 We then construct the trie using only the .next slots of the entry
1614 structs.
1615
1616 We use the .check field of the first entry of the node temporarily to
1617 make compression both faster and easier by keeping track of how many non
1618 zero fields are in the node.
1619
1620 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1621 transition.
1622
1623 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1624 number representing the first entry of the node, and state as a
1625 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1626 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1627 are 2 entrys per node. eg:
1628
1629 A B A B
1630 1. 2 4 1. 3 7
1631 2. 0 3 3. 0 5
1632 3. 0 0 5. 0 0
1633 4. 0 0 7. 0 0
1634
1635 The table is internally in the right hand, idx form. However as we also
1636 have to deal with the states array which is indexed by nodenum we have to
1637 use TRIE_NODENUM() to convert.
1638
1639 */
1e2e3d02
YO
1640 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1641 "%*sCompiling trie using table compiler\n",
1642 (int)depth * 2 + 2, ""));
3dab1dad 1643
c944940b
JH
1644 trie->trans = (reg_trie_trans *)
1645 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1646 * trie->uniquecharcount + 1,
1647 sizeof(reg_trie_trans) );
1648 trie->states = (reg_trie_state *)
1649 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1650 sizeof(reg_trie_state) );
a3621e74
YO
1651 next_alloc = trie->uniquecharcount + 1;
1652
3dab1dad 1653
a3621e74
YO
1654 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1655
c445ea15 1656 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1657 const U8 *uc = (U8*)STRING( noper );
1658 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1659
1660 U32 state = 1; /* required init */
1661
1662 U16 charid = 0; /* sanity init */
1663 U32 accept_state = 0; /* sanity init */
1664 U8 *scan = (U8*)NULL; /* sanity init */
1665
1666 STRLEN foldlen = 0; /* required init */
07be1b83 1667 U32 wordlen = 0; /* required init */
a3621e74
YO
1668 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1669
3dab1dad 1670 if ( OP(noper) != NOTHING ) {
786e8c11 1671 for ( ; uc < e ; uc += len ) {
a3621e74 1672
786e8c11 1673 TRIE_READ_CHAR;
a3621e74 1674
786e8c11
YO
1675 if ( uvc < 256 ) {
1676 charid = trie->charmap[ uvc ];
1677 } else {
55eed653 1678 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1679 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1680 }
786e8c11
YO
1681 if ( charid ) {
1682 charid--;
1683 if ( !trie->trans[ state + charid ].next ) {
1684 trie->trans[ state + charid ].next = next_alloc;
1685 trie->trans[ state ].check++;
1686 next_alloc += trie->uniquecharcount;
1687 }
1688 state = trie->trans[ state + charid ].next;
1689 } else {
1690 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1691 }
1692 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1693 }
a3621e74 1694 }
3dab1dad
YO
1695 accept_state = TRIE_NODENUM( state );
1696 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1697
1698 } /* end second pass */
1699
3dab1dad 1700 /* and now dump it out before we compress it */
2b8b4781
NC
1701 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1702 revcharmap,
1703 next_alloc, depth+1));
a3621e74 1704
a3621e74
YO
1705 {
1706 /*
1707 * Inplace compress the table.*
1708
1709 For sparse data sets the table constructed by the trie algorithm will
1710 be mostly 0/FAIL transitions or to put it another way mostly empty.
1711 (Note that leaf nodes will not contain any transitions.)
1712
1713 This algorithm compresses the tables by eliminating most such
1714 transitions, at the cost of a modest bit of extra work during lookup:
1715
1716 - Each states[] entry contains a .base field which indicates the
1717 index in the state[] array wheres its transition data is stored.
1718
1719 - If .base is 0 there are no valid transitions from that node.
1720
1721 - If .base is nonzero then charid is added to it to find an entry in
1722 the trans array.
1723
1724 -If trans[states[state].base+charid].check!=state then the
1725 transition is taken to be a 0/Fail transition. Thus if there are fail
1726 transitions at the front of the node then the .base offset will point
1727 somewhere inside the previous nodes data (or maybe even into a node
1728 even earlier), but the .check field determines if the transition is
1729 valid.
1730
786e8c11 1731 XXX - wrong maybe?
a3621e74
YO
1732 The following process inplace converts the table to the compressed
1733 table: We first do not compress the root node 1,and mark its all its
1734 .check pointers as 1 and set its .base pointer as 1 as well. This
1735 allows to do a DFA construction from the compressed table later, and
1736 ensures that any .base pointers we calculate later are greater than
1737 0.
1738
1739 - We set 'pos' to indicate the first entry of the second node.
1740
1741 - We then iterate over the columns of the node, finding the first and
1742 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1743 and set the .check pointers accordingly, and advance pos
1744 appropriately and repreat for the next node. Note that when we copy
1745 the next pointers we have to convert them from the original
1746 NODEIDX form to NODENUM form as the former is not valid post
1747 compression.
1748
1749 - If a node has no transitions used we mark its base as 0 and do not
1750 advance the pos pointer.
1751
1752 - If a node only has one transition we use a second pointer into the
1753 structure to fill in allocated fail transitions from other states.
1754 This pointer is independent of the main pointer and scans forward
1755 looking for null transitions that are allocated to a state. When it
1756 finds one it writes the single transition into the "hole". If the
786e8c11 1757 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1758
1759 - Once compressed we can Renew/realloc the structures to release the
1760 excess space.
1761
1762 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1763 specifically Fig 3.47 and the associated pseudocode.
1764
1765 demq
1766 */
a3b680e6 1767 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1768 U32 state, charid;
a3621e74 1769 U32 pos = 0, zp=0;
1e2e3d02 1770 trie->statecount = laststate;
a3621e74
YO
1771
1772 for ( state = 1 ; state < laststate ; state++ ) {
1773 U8 flag = 0;
a28509cc
AL
1774 const U32 stateidx = TRIE_NODEIDX( state );
1775 const U32 o_used = trie->trans[ stateidx ].check;
1776 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1777 trie->trans[ stateidx ].check = 0;
1778
1779 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1780 if ( flag || trie->trans[ stateidx + charid ].next ) {
1781 if ( trie->trans[ stateidx + charid ].next ) {
1782 if (o_used == 1) {
1783 for ( ; zp < pos ; zp++ ) {
1784 if ( ! trie->trans[ zp ].next ) {
1785 break;
1786 }
1787 }
1788 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1789 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1790 trie->trans[ zp ].check = state;
1791 if ( ++zp > pos ) pos = zp;
1792 break;
1793 }
1794 used--;
1795 }
1796 if ( !flag ) {
1797 flag = 1;
1798 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1799 }
1800 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1801 trie->trans[ pos ].check = state;
1802 pos++;
1803 }
1804 }
1805 }
cc601c31 1806 trie->lasttrans = pos + 1;
c944940b
JH
1807 trie->states = (reg_trie_state *)
1808 PerlMemShared_realloc( trie->states, laststate
1809 * sizeof(reg_trie_state) );
a3621e74 1810 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1811 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1812 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1813 (int)depth * 2 + 2,"",
1814 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1815 (IV)next_alloc,
1816 (IV)pos,
a3621e74
YO
1817 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1818 );
1819
1820 } /* end table compress */
1821 }
1e2e3d02
YO
1822 DEBUG_TRIE_COMPILE_MORE_r(
1823 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1824 (int)depth * 2 + 2, "",
1825 (UV)trie->statecount,
1826 (UV)trie->lasttrans)
1827 );
cc601c31 1828 /* resize the trans array to remove unused space */
c944940b
JH
1829 trie->trans = (reg_trie_trans *)
1830 PerlMemShared_realloc( trie->trans, trie->lasttrans
1831 * sizeof(reg_trie_trans) );
a3621e74 1832
3dab1dad 1833 /* and now dump out the compressed format */
2b8b4781 1834 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1835
3dab1dad 1836 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1837 U8 nodetype =(U8)(flags & 0xFF);
1838 char *str=NULL;
786e8c11 1839
07be1b83 1840#ifdef DEBUGGING
e62cc96a 1841 regnode *optimize = NULL;
b57a0404
JH
1842 U32 mjd_offset = 0;
1843 U32 mjd_nodelen = 0;
07be1b83 1844#endif
a3621e74 1845 /*
3dab1dad
YO
1846 This means we convert either the first branch or the first Exact,
1847 depending on whether the thing following (in 'last') is a branch
1848 or not and whther first is the startbranch (ie is it a sub part of
1849 the alternation or is it the whole thing.)
1850 Assuming its a sub part we conver the EXACT otherwise we convert
1851 the whole branch sequence, including the first.
a3621e74 1852 */
3dab1dad 1853 /* Find the node we are going to overwrite */
7f69552c 1854 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1855 /* branch sub-chain */
3dab1dad 1856 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1857 DEBUG_r({
1858 mjd_offset= Node_Offset((convert));
1859 mjd_nodelen= Node_Length((convert));
1860 });
7f69552c
YO
1861 /* whole branch chain */
1862 } else {
1863 DEBUG_r({
1864 const regnode *nop = NEXTOPER( convert );
1865 mjd_offset= Node_Offset((nop));
1866 mjd_nodelen= Node_Length((nop));
1867 });
07be1b83 1868 }
7f69552c 1869
07be1b83
YO
1870 DEBUG_OPTIMISE_r(
1871 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1872 (int)depth * 2 + 2, "",
786e8c11 1873 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1874 );
a3621e74 1875
3dab1dad
YO
1876 /* But first we check to see if there is a common prefix we can
1877 split out as an EXACT and put in front of the TRIE node. */
1878 trie->startstate= 1;
55eed653 1879 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1880 U32 state;
1e2e3d02 1881 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1882 U32 ofs = 0;
8e11feef
RGS
1883 I32 idx = -1;
1884 U32 count = 0;
1885 const U32 base = trie->states[ state ].trans.base;
a3621e74 1886
3dab1dad 1887 if ( trie->states[state].wordnum )
8e11feef 1888 count = 1;
a3621e74 1889
8e11feef 1890 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1891 if ( ( base + ofs >= trie->uniquecharcount ) &&
1892 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1893 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1894 {
3dab1dad 1895 if ( ++count > 1 ) {
2b8b4781 1896 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1897 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1898 if ( state == 1 ) break;
3dab1dad
YO
1899 if ( count == 2 ) {
1900 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1901 DEBUG_OPTIMISE_r(
8e11feef
RGS
1902 PerlIO_printf(Perl_debug_log,
1903 "%*sNew Start State=%"UVuf" Class: [",
1904 (int)depth * 2 + 2, "",
786e8c11 1905 (UV)state));
be8e71aa 1906 if (idx >= 0) {
2b8b4781 1907 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1908 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1909
3dab1dad 1910 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1911 if ( folder )
1912 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1913 DEBUG_OPTIMISE_r(
07be1b83 1914 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1915 );
8e11feef
RGS
1916 }
1917 }
1918 TRIE_BITMAP_SET(trie,*ch);
1919 if ( folder )
1920 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1921 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1922 }
1923 idx = ofs;
1924 }
3dab1dad
YO
1925 }
1926 if ( count == 1 ) {
2b8b4781 1927 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1928 char *ch = SvPV_nolen( *tmp );
1929 DEBUG_OPTIMISE_r({
1930 SV *sv=sv_newmortal();
8e11feef
RGS
1931 PerlIO_printf( Perl_debug_log,
1932 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1933 (int)depth * 2 + 2, "",
de734bd5
A
1934 (UV)state, (UV)idx,
1935 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1936 PL_colors[0], PL_colors[1],
1937 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1938 PERL_PV_ESCAPE_FIRSTCHAR
1939 )
1940 );
1941 });
3dab1dad
YO
1942 if ( state==1 ) {
1943 OP( convert ) = nodetype;
1944 str=STRING(convert);
1945 STR_LEN(convert)=0;
1946 }
de734bd5
A
1947 while (*ch) {
1948 *str++ = *ch++;
1949 STR_LEN(convert)++;
1950 }
1951
8e11feef 1952 } else {
f9049ba1 1953#ifdef DEBUGGING
8e11feef
RGS
1954 if (state>1)
1955 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1956#endif
8e11feef
RGS
1957 break;
1958 }
1959 }
3dab1dad 1960 if (str) {
8e11feef 1961 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1962 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1963 trie->startstate = state;
07be1b83
YO
1964 trie->minlen -= (state - 1);
1965 trie->maxlen -= (state - 1);
1966 DEBUG_r({
1967 regnode *fix = convert;
de734bd5 1968 U32 word = trie->wordcount;
07be1b83
YO
1969 mjd_nodelen++;
1970 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1971 while( ++fix < n ) {
1972 Set_Node_Offset_Length(fix, 0, 0);
1973 }
de734bd5 1974 while (word--) {
2b8b4781 1975 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
1976 if (tmp) {
1977 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1978 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1979 else
1980 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1981 }
1982 }
07be1b83 1983 });
8e11feef
RGS
1984 if (trie->maxlen) {
1985 convert = n;
1986 } else {
3dab1dad 1987 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1988 DEBUG_r(optimize= n);
3dab1dad
YO
1989 }
1990 }
1991 }
a5ca303d
YO
1992 if (!jumper)
1993 jumper = last;
3dab1dad 1994 if ( trie->maxlen ) {
8e11feef
RGS
1995 NEXT_OFF( convert ) = (U16)(tail - convert);
1996 ARG_SET( convert, data_slot );
786e8c11
YO
1997 /* Store the offset to the first unabsorbed branch in
1998 jump[0], which is otherwise unused by the jump logic.
1999 We use this when dumping a trie and during optimisation. */
2000 if (trie->jump)
7f69552c 2001 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2002
786e8c11
YO
2003 /* XXXX */
2004 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2005 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2006 {
2007 OP( convert ) = TRIEC;
2008 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2009 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2010 trie->bitmap= NULL;
2011 } else
2012 OP( convert ) = TRIE;
a3621e74 2013
3dab1dad
YO
2014 /* store the type in the flags */
2015 convert->flags = nodetype;
a5ca303d
YO
2016 DEBUG_r({
2017 optimize = convert
2018 + NODE_STEP_REGNODE
2019 + regarglen[ OP( convert ) ];
2020 });
2021 /* XXX We really should free up the resource in trie now,
2022 as we won't use them - (which resources?) dmq */
3dab1dad 2023 }
a3621e74 2024 /* needed for dumping*/
e62cc96a 2025 DEBUG_r(if (optimize) {
07be1b83 2026 regnode *opt = convert;
e62cc96a 2027 while ( ++opt < optimize) {
07be1b83
YO
2028 Set_Node_Offset_Length(opt,0,0);
2029 }
786e8c11
YO
2030 /*
2031 Try to clean up some of the debris left after the
2032 optimisation.
a3621e74 2033 */
786e8c11 2034 while( optimize < jumper ) {
07be1b83 2035 mjd_nodelen += Node_Length((optimize));
a3621e74 2036 OP( optimize ) = OPTIMIZED;
07be1b83 2037 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2038 optimize++;
2039 }
07be1b83 2040 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2041 });
2042 } /* end node insert */
55eed653 2043 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2044#ifdef DEBUGGING
2045 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2046 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2047#else
2048 SvREFCNT_dec(revcharmap);
07be1b83 2049#endif
786e8c11
YO
2050 return trie->jump
2051 ? MADE_JUMP_TRIE
2052 : trie->startstate>1
2053 ? MADE_EXACT_TRIE
2054 : MADE_TRIE;
2055}
2056
2057STATIC void
2058S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2059{
2060/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2061
2062 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2063 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2064 ISBN 0-201-10088-6
2065
2066 We find the fail state for each state in the trie, this state is the longest proper
2067 suffix of the current states 'word' that is also a proper prefix of another word in our
2068 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2069 the DFA not to have to restart after its tried and failed a word at a given point, it
2070 simply continues as though it had been matching the other word in the first place.
2071 Consider
2072 'abcdgu'=~/abcdefg|cdgu/
2073 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2074 fail, which would bring use to the state representing 'd' in the second word where we would
2075 try 'g' and succeed, prodceding to match 'cdgu'.
2076 */
2077 /* add a fail transition */
3251b653
NC
2078 const U32 trie_offset = ARG(source);
2079 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2080 U32 *q;
2081 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2082 const U32 numstates = trie->statecount;
786e8c11
YO
2083 const U32 ubound = trie->lasttrans + ucharcount;
2084 U32 q_read = 0;
2085 U32 q_write = 0;
2086 U32 charid;
2087 U32 base = trie->states[ 1 ].trans.base;
2088 U32 *fail;
2089 reg_ac_data *aho;
2090 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2091 GET_RE_DEBUG_FLAGS_DECL;
2092#ifndef DEBUGGING
2093 PERL_UNUSED_ARG(depth);
2094#endif
2095
2096
2097 ARG_SET( stclass, data_slot );
c944940b 2098 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2099 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2100 aho->trie=trie_offset;
446bd890
NC
2101 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2102 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2103 Newxz( q, numstates, U32);
c944940b 2104 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2105 aho->refcount = 1;
2106 fail = aho->fail;
2107 /* initialize fail[0..1] to be 1 so that we always have
2108 a valid final fail state */
2109 fail[ 0 ] = fail[ 1 ] = 1;
2110
2111 for ( charid = 0; charid < ucharcount ; charid++ ) {
2112 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2113 if ( newstate ) {
2114 q[ q_write ] = newstate;
2115 /* set to point at the root */
2116 fail[ q[ q_write++ ] ]=1;
2117 }
2118 }
2119 while ( q_read < q_write) {
2120 const U32 cur = q[ q_read++ % numstates ];
2121 base = trie->states[ cur ].trans.base;
2122
2123 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2124 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2125 if (ch_state) {
2126 U32 fail_state = cur;
2127 U32 fail_base;
2128 do {
2129 fail_state = fail[ fail_state ];
2130 fail_base = aho->states[ fail_state ].trans.base;
2131 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2132
2133 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2134 fail[ ch_state ] = fail_state;
2135 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2136 {
2137 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2138 }
2139 q[ q_write++ % numstates] = ch_state;
2140 }
2141 }
2142 }
2143 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2144 when we fail in state 1, this allows us to use the
2145 charclass scan to find a valid start char. This is based on the principle
2146 that theres a good chance the string being searched contains lots of stuff
2147 that cant be a start char.
2148 */
2149 fail[ 0 ] = fail[ 1 ] = 0;
2150 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2151 PerlIO_printf(Perl_debug_log,
2152 "%*sStclass Failtable (%"UVuf" states): 0",
2153 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2154 );
786e8c11
YO
2155 for( q_read=1; q_read<numstates; q_read++ ) {
2156 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2157 }
2158 PerlIO_printf(Perl_debug_log, "\n");
2159 });
2160 Safefree(q);
2161 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2162}
2163
786e8c11 2164
a3621e74 2165/*
5d1c421c
JH
2166 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2167 * These need to be revisited when a newer toolchain becomes available.
2168 */
2169#if defined(__sparc64__) && defined(__GNUC__)
2170# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2171# undef SPARC64_GCC_WORKAROUND
2172# define SPARC64_GCC_WORKAROUND 1
2173# endif
2174#endif
2175
07be1b83 2176#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2177 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2178 SV * const mysv=sv_newmortal(); \
2179 regnode *Next = regnext(scan); \
2180 regprop(RExC_rx, mysv, scan); \
7f69552c 2181 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2182 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2183 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2184 }});
07be1b83 2185
1de06328
YO
2186
2187
2188
2189
07be1b83
YO
2190#define JOIN_EXACT(scan,min,flags) \
2191 if (PL_regkind[OP(scan)] == EXACT) \
2192 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2193
be8e71aa 2194STATIC U32
07be1b83
YO
2195S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2196 /* Merge several consecutive EXACTish nodes into one. */
2197 regnode *n = regnext(scan);
2198 U32 stringok = 1;
2199 regnode *next = scan + NODE_SZ_STR(scan);
2200 U32 merged = 0;
2201 U32 stopnow = 0;
2202#ifdef DEBUGGING
2203 regnode *stop = scan;
72f13be8 2204 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2205#else
d47053eb
RGS
2206 PERL_UNUSED_ARG(depth);
2207#endif
2208#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2209 PERL_UNUSED_ARG(flags);
2210 PERL_UNUSED_ARG(val);
07be1b83 2211#endif
07be1b83
YO
2212 DEBUG_PEEP("join",scan,depth);
2213
2214 /* Skip NOTHING, merge EXACT*. */
2215 while (n &&
2216 ( PL_regkind[OP(n)] == NOTHING ||
2217 (stringok && (OP(n) == OP(scan))))
2218 && NEXT_OFF(n)
2219 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2220
2221 if (OP(n) == TAIL || n > next)
2222 stringok = 0;
2223 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2224 DEBUG_PEEP("skip:",n,depth);
2225 NEXT_OFF(scan) += NEXT_OFF(n);
2226 next = n + NODE_STEP_REGNODE;
2227#ifdef DEBUGGING
2228 if (stringok)
2229 stop = n;
2230#endif
2231 n = regnext(n);
2232 }
2233 else if (stringok) {
786e8c11 2234 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2235 regnode * const nnext = regnext(n);
2236
2237 DEBUG_PEEP("merg",n,depth);
2238
2239 merged++;
2240 if (oldl + STR_LEN(n) > U8_MAX)
2241 break;
2242 NEXT_OFF(scan) += NEXT_OFF(n);
2243 STR_LEN(scan) += STR_LEN(n);
2244 next = n + NODE_SZ_STR(n);
2245 /* Now we can overwrite *n : */
2246 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2247#ifdef DEBUGGING
2248 stop = next - 1;
2249#endif
2250 n = nnext;
2251 if (stopnow) break;
2252 }
2253
d47053eb
RGS
2254#ifdef EXPERIMENTAL_INPLACESCAN
2255 if (flags && !NEXT_OFF(n)) {
2256 DEBUG_PEEP("atch", val, depth);
2257 if (reg_off_by_arg[OP(n)]) {
2258 ARG_SET(n, val - n);
2259 }
2260 else {
2261 NEXT_OFF(n) = val - n;
2262 }
2263 stopnow = 1;
2264 }
07be1b83
YO
2265#endif
2266 }
2267
2268 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2269 /*
2270 Two problematic code points in Unicode casefolding of EXACT nodes:
2271
2272 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2273 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2274
2275 which casefold to
2276
2277 Unicode UTF-8
2278
2279 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2280 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2281
2282 This means that in case-insensitive matching (or "loose matching",
2283 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2284 length of the above casefolded versions) can match a target string
2285 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2286 This would rather mess up the minimum length computation.
2287
2288 What we'll do is to look for the tail four bytes, and then peek
2289 at the preceding two bytes to see whether we need to decrease
2290 the minimum length by four (six minus two).
2291
2292 Thanks to the design of UTF-8, there cannot be false matches:
2293 A sequence of valid UTF-8 bytes cannot be a subsequence of
2294 another valid sequence of UTF-8 bytes.
2295
2296 */
2297 char * const s0 = STRING(scan), *s, *t;
2298 char * const s1 = s0 + STR_LEN(scan) - 1;
2299 char * const s2 = s1 - 4;
e294cc5d
JH
2300#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2301 const char t0[] = "\xaf\x49\xaf\x42";
2302#else
07be1b83 2303 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2304#endif
07be1b83
YO
2305 const char * const t1 = t0 + 3;
2306
2307 for (s = s0 + 2;
2308 s < s2 && (t = ninstr(s, s1, t0, t1));
2309 s = t + 4) {
e294cc5d
JH
2310#ifdef EBCDIC
2311 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2312 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2313#else
07be1b83
YO
2314 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2315 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2316#endif
07be1b83
YO
2317 *min -= 4;
2318 }
2319 }
2320
2321#ifdef DEBUGGING
2322 /* Allow dumping */
2323 n = scan + NODE_SZ_STR(scan);
2324 while (n <= stop) {
2325 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2326 OP(n) = OPTIMIZED;
2327 NEXT_OFF(n) = 0;
2328 }
2329 n++;
2330 }
2331#endif
2332 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2333 return stopnow;
2334}
2335
653099ff
GS
2336/* REx optimizer. Converts nodes into quickier variants "in place".
2337 Finds fixed substrings. */
2338
a0288114 2339/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2340 to the position after last scanned or to NULL. */
2341
40d049e4
YO
2342#define INIT_AND_WITHP \
2343 assert(!and_withp); \
2344 Newx(and_withp,1,struct regnode_charclass_class); \
2345 SAVEFREEPV(and_withp)
07be1b83 2346
b515a41d
YO
2347/* this is a chain of data about sub patterns we are processing that
2348 need to be handled seperately/specially in study_chunk. Its so
2349 we can simulate recursion without losing state. */
2350struct scan_frame;
2351typedef struct scan_frame {
2352 regnode *last; /* last node to process in this frame */
2353 regnode *next; /* next node to process when last is reached */
2354 struct scan_frame *prev; /*previous frame*/
2355 I32 stop; /* what stopparen do we use */
2356} scan_frame;
2357
304ee84b
YO
2358
2359#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2360
76e3520e 2361STATIC I32
40d049e4 2362S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2363 I32 *minlenp, I32 *deltap,
40d049e4
YO
2364 regnode *last,
2365 scan_data_t *data,
2366 I32 stopparen,
2367 U8* recursed,
2368 struct regnode_charclass_class *and_withp,
2369 U32 flags, U32 depth)
c277df42
IZ
2370 /* scanp: Start here (read-write). */
2371 /* deltap: Write maxlen-minlen here. */
2372 /* last: Stop before this one. */
40d049e4
YO
2373 /* data: string data about the pattern */
2374 /* stopparen: treat close N as END */
2375 /* recursed: which subroutines have we recursed into */
2376 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2377{
97aff369 2378 dVAR;
c277df42
IZ
2379 I32 min = 0, pars = 0, code;
2380 regnode *scan = *scanp, *next;
2381 I32 delta = 0;
2382 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2383 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2384 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2385 scan_data_t data_fake;
a3621e74 2386 SV *re_trie_maxbuff = NULL;
786e8c11 2387 regnode *first_non_open = scan;
e2e6a0f1 2388 I32 stopmin = I32_MAX;
8aa23a47
YO
2389 scan_frame *frame = NULL;
2390
a3621e74 2391 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2392
13a24bad 2393#ifdef DEBUGGING
40d049e4 2394 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2395#endif
40d049e4 2396
786e8c11 2397 if ( depth == 0 ) {
40d049e4 2398 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2399 first_non_open=regnext(first_non_open);
2400 }
2401
b81d288d 2402
8aa23a47
YO
2403 fake_study_recurse:
2404 while ( scan && OP(scan) != END && scan < last ){
2405 /* Peephole optimizer: */
304ee84b 2406 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2407 DEBUG_PEEP("Peep",scan,depth);
2408 JOIN_EXACT(scan,&min,0);
2409
2410 /* Follow the next-chain of the current node and optimize
2411 away all the NOTHINGs from it. */
2412 if (OP(scan) != CURLYX) {
2413 const int max = (reg_off_by_arg[OP(scan)]
2414 ? I32_MAX
2415 /* I32 may be smaller than U16 on CRAYs! */
2416 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2417 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2418 int noff;
2419 regnode *n = scan;
2420
2421 /* Skip NOTHING and LONGJMP. */
2422 while ((n = regnext(n))
2423 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2424 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2425 && off + noff < max)
2426 off += noff;
2427 if (reg_off_by_arg[OP(scan)])
2428 ARG(scan) = off;
2429 else
2430 NEXT_OFF(scan) = off;
2431 }
a3621e74 2432
c277df42 2433
8aa23a47
YO
2434
2435 /* The principal pseudo-switch. Cannot be a switch, since we
2436 look into several different things. */
2437 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2438 || OP(scan) == IFTHEN) {
2439 next = regnext(scan);
2440 code = OP(scan);
2441 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2442
2443 if (OP(next) == code || code == IFTHEN) {
2444 /* NOTE - There is similar code to this block below for handling
2445 TRIE nodes on a re-study. If you change stuff here check there
2446 too. */
2447 I32 max1 = 0, min1 = I32_MAX, num = 0;
2448 struct regnode_charclass_class accum;
2449 regnode * const startbranch=scan;
2450
2451 if (flags & SCF_DO_SUBSTR)
304ee84b 2452 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2453 if (flags & SCF_DO_STCLASS)
2454 cl_init_zero(pRExC_state, &accum);
2455
2456 while (OP(scan) == code) {
2457 I32 deltanext, minnext, f = 0, fake;
2458 struct regnode_charclass_class this_class;
2459
2460 num++;
2461 data_fake.flags = 0;
2462 if (data) {
2463 data_fake.whilem_c = data->whilem_c;
2464 data_fake.last_closep = data->last_closep;
2465 }
2466 else
2467 data_fake.last_closep = &fake;
58e23c8d
YO
2468
2469 data_fake.pos_delta = delta;
8aa23a47
YO
2470 next = regnext(scan);
2471 scan = NEXTOPER(scan);
2472 if (code != BRANCH)
c277df42 2473 scan = NEXTOPER(scan);
8aa23a47
YO
2474 if (flags & SCF_DO_STCLASS) {
2475 cl_init(pRExC_state, &this_class);
2476 data_fake.start_class = &this_class;
2477 f = SCF_DO_STCLASS_AND;
58e23c8d 2478 }
8aa23a47
YO
2479 if (flags & SCF_WHILEM_VISITED_POS)
2480 f |= SCF_WHILEM_VISITED_POS;
2481
2482 /* we suppose the run is continuous, last=next...*/
2483 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2484 next, &data_fake,
2485 stopparen, recursed, NULL, f,depth+1);
2486 if (min1 > minnext)
2487 min1 = minnext;
2488 if (max1 < minnext + deltanext)
2489 max1 = minnext + deltanext;
2490 if (deltanext == I32_MAX)
2491 is_inf = is_inf_internal = 1;
2492 scan = next;
2493 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2494 pars++;
2495 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2496 if ( stopmin > minnext)
2497 stopmin = min + min1;
2498 flags &= ~SCF_DO_SUBSTR;
2499 if (data)
2500 data->flags |= SCF_SEEN_ACCEPT;
2501 }
2502 if (data) {
2503 if (data_fake.flags & SF_HAS_EVAL)
2504 data->flags |= SF_HAS_EVAL;
2505 data->whilem_c = data_fake.whilem_c;
3dab1dad 2506 }
8aa23a47
YO
2507 if (flags & SCF_DO_STCLASS)
2508 cl_or(pRExC_state, &accum, &this_class);
2509 }
2510 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2511 min1 = 0;
2512 if (flags & SCF_DO_SUBSTR) {
2513 data->pos_min += min1;
2514 data->pos_delta += max1 - min1;
2515 if (max1 != min1 || is_inf)
2516 data->longest = &(data->longest_float);
2517 }
2518 min += min1;
2519 delta += max1 - min1;
2520 if (flags & SCF_DO_STCLASS_OR) {
2521 cl_or(pRExC_state, data->start_class, &accum);
2522 if (min1) {
2523 cl_and(data->start_class, and_withp);
2524 flags &= ~SCF_DO_STCLASS;
653099ff 2525 }
8aa23a47
YO
2526 }
2527 else if (flags & SCF_DO_STCLASS_AND) {
2528 if (min1) {
2529 cl_and(data->start_class, &accum);
2530 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2531 }
8aa23a47
YO
2532 else {
2533 /* Switch to OR mode: cache the old value of
2534 * data->start_class */
2535 INIT_AND_WITHP;
2536 StructCopy(data->start_class, and_withp,
2537 struct regnode_charclass_class);
2538 flags &= ~SCF_DO_STCLASS_AND;
2539 StructCopy(&accum, data->start_class,
2540 struct regnode_charclass_class);
2541 flags |= SCF_DO_STCLASS_OR;
2542 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2543 }
8aa23a47 2544 }
a3621e74 2545
8aa23a47
YO
2546 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2547 /* demq.
a3621e74 2548
8aa23a47
YO
2549 Assuming this was/is a branch we are dealing with: 'scan' now
2550 points at the item that follows the branch sequence, whatever
2551 it is. We now start at the beginning of the sequence and look
2552 for subsequences of
a3621e74 2553
8aa23a47
YO
2554 BRANCH->EXACT=>x1
2555 BRANCH->EXACT=>x2
2556 tail
a3621e74 2557
8aa23a47 2558 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2559
8aa23a47
YO
2560 If we can find such a subseqence we need to turn the first
2561 element into a trie and then add the subsequent branch exact
2562 strings to the trie.
a3621e74 2563
8aa23a47 2564 We have two cases
a3621e74 2565
8aa23a47 2566 1. patterns where the whole set of branch can be converted.
a3621e74 2567
8aa23a47 2568 2. patterns where only a subset can be converted.
a3621e74 2569
8aa23a47
YO
2570 In case 1 we can replace the whole set with a single regop
2571 for the trie. In case 2 we need to keep the start and end
2572 branchs so
a3621e74 2573
8aa23a47
YO
2574 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2575 becomes BRANCH TRIE; BRANCH X;
786e8c11 2576
8aa23a47
YO
2577 There is an additional case, that being where there is a
2578 common prefix, which gets split out into an EXACT like node
2579 preceding the TRIE node.
a3621e74 2580
8aa23a47
YO
2581 If x(1..n)==tail then we can do a simple trie, if not we make
2582 a "jump" trie, such that when we match the appropriate word
2583 we "jump" to the appopriate tail node. Essentailly we turn
2584 a nested if into a case structure of sorts.
b515a41d 2585
8aa23a47
YO
2586 */
2587
2588 int made=0;
2589 if (!re_trie_maxbuff) {
2590 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2591 if (!SvIOK(re_trie_maxbuff))
2592 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2593 }
2594 if ( SvIV(re_trie_maxbuff)>=0 ) {
2595 regnode *cur;
2596 regnode *first = (regnode *)NULL;
2597 regnode *last = (regnode *)NULL;
2598 regnode *tail = scan;
2599 U8 optype = 0;
2600 U32 count=0;
a3621e74
YO
2601
2602#ifdef DEBUGGING
8aa23a47 2603 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2604#endif
8aa23a47
YO
2605 /* var tail is used because there may be a TAIL
2606 regop in the way. Ie, the exacts will point to the
2607 thing following the TAIL, but the last branch will
2608 point at the TAIL. So we advance tail. If we
2609 have nested (?:) we may have to move through several
2610 tails.
2611 */
2612
2613 while ( OP( tail ) == TAIL ) {
2614 /* this is the TAIL generated by (?:) */
2615 tail = regnext( tail );
2616 }
a3621e74 2617
8aa23a47
YO
2618
2619 DEBUG_OPTIMISE_r({
2620 regprop(RExC_rx, mysv, tail );
2621 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2622 (int)depth * 2 + 2, "",
2623 "Looking for TRIE'able sequences. Tail node is: ",
2624 SvPV_nolen_const( mysv )
2625 );
2626 });
2627
2628 /*
2629
2630 step through the branches, cur represents each
2631 branch, noper is the first thing to be matched
2632 as part of that branch and noper_next is the
2633 regnext() of that node. if noper is an EXACT
2634 and noper_next is the same as scan (our current
2635 position in the regex) then the EXACT branch is
2636 a possible optimization target. Once we have
2637 two or more consequetive such branches we can
2638 create a trie of the EXACT's contents and stich
2639 it in place. If the sequence represents all of
2640 the branches we eliminate the whole thing and
2641 replace it with a single TRIE. If it is a
2642 subsequence then we need to stitch it in. This
2643 means the first branch has to remain, and needs
2644 to be repointed at the item on the branch chain
2645 following the last branch optimized. This could
2646 be either a BRANCH, in which case the
2647 subsequence is internal, or it could be the
2648 item following the branch sequence in which
2649 case the subsequence is at the end.
2650
2651 */
2652
2653 /* dont use tail as the end marker for this traverse */
2654 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2655 regnode * const noper = NEXTOPER( cur );
b515a41d 2656#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2657 regnode * const noper_next = regnext( noper );
b515a41d
YO
2658#endif
2659
8aa23a47
YO
2660 DEBUG_OPTIMISE_r({
2661 regprop(RExC_rx, mysv, cur);
2662 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2663 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2664
2665 regprop(RExC_rx, mysv, noper);
2666 PerlIO_printf( Perl_debug_log, " -> %s",
2667 SvPV_nolen_const(mysv));
2668
2669 if ( noper_next ) {
2670 regprop(RExC_rx, mysv, noper_next );
2671 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2672 SvPV_nolen_const(mysv));
2673 }
2674 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2675 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2676 });
2677 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2678 : PL_regkind[ OP( noper ) ] == EXACT )
2679 || OP(noper) == NOTHING )
786e8c11 2680#ifdef NOJUMPTRIE
8aa23a47 2681 && noper_next == tail
786e8c11 2682#endif
8aa23a47
YO
2683 && count < U16_MAX)
2684 {
2685 count++;
2686 if ( !first || optype == NOTHING ) {
2687 if (!first) first = cur;
2688 optype = OP( noper );
2689 } else {
2690 last = cur;
2691 }
2692 } else {
2693 if ( last ) {
2694 make_trie( pRExC_state,
2695 startbranch, first, cur, tail, count,
2696 optype, depth+1 );
2697 }
2698 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2699#ifdef NOJUMPTRIE
8aa23a47 2700 && noper_next == tail
786e8c11 2701#endif
8aa23a47
YO
2702 ){
2703 count = 1;
2704 first = cur;
2705 optype = OP( noper );
2706 } else {
2707 count = 0;
2708 first = NULL;
2709 optype = 0;
2710 }
2711 last = NULL;
2712 }
2713 }
2714 DEBUG_OPTIMISE_r({
2715 regprop(RExC_rx, mysv, cur);
2716 PerlIO_printf( Perl_debug_log,
2717 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2718 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2719
2720 });
2721 if ( last ) {
2722 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2723#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2724 if ( ((made == MADE_EXACT_TRIE &&
2725 startbranch == first)
2726 || ( first_non_open == first )) &&
2727 depth==0 ) {
2728 flags |= SCF_TRIE_RESTUDY;
2729 if ( startbranch == first
2730 && scan == tail )
2731 {
2732 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2733 }
2734 }
3dab1dad 2735#endif
8aa23a47
YO
2736 }
2737 }
2738
2739 } /* do trie */
2740
653099ff 2741 }
8aa23a47
YO
2742 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2743 scan = NEXTOPER(NEXTOPER(scan));
2744 } else /* single branch is optimized. */
2745 scan = NEXTOPER(scan);
2746 continue;
2747 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2748 scan_frame *newframe = NULL;
2749 I32 paren;
2750 regnode *start;
2751 regnode *end;
2752
2753 if (OP(scan) != SUSPEND) {
2754 /* set the pointer */
2755 if (OP(scan) == GOSUB) {
2756 paren = ARG(scan);
2757 RExC_recurse[ARG2L(scan)] = scan;
2758 start = RExC_open_parens[paren-1];
2759 end = RExC_close_parens[paren-1];
2760 } else {
2761 paren = 0;
f8fc2ecf 2762 start = RExC_rxi->program + 1;
8aa23a47
YO
2763 end = RExC_opend;
2764 }
2765 if (!recursed) {
2766 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2767 SAVEFREEPV(recursed);
2768 }
2769 if (!PAREN_TEST(recursed,paren+1)) {
2770 PAREN_SET(recursed,paren+1);
2771 Newx(newframe,1,scan_frame);
2772 } else {
2773 if (flags & SCF_DO_SUBSTR) {
304ee84b 2774 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2775 data->longest = &(data->longest_float);
2776 }
2777 is_inf = is_inf_internal = 1;
2778 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2779 cl_anything(pRExC_state, data->start_class);
2780 flags &= ~SCF_DO_STCLASS;
2781 }
2782 } else {
2783 Newx(newframe,1,scan_frame);
2784 paren = stopparen;
2785 start = scan+2;
2786 end = regnext(scan);
2787 }
2788 if (newframe) {
2789 assert(start);
2790 assert(end);
2791 SAVEFREEPV(newframe);
2792 newframe->next = regnext(scan);
2793 newframe->last = last;
2794 newframe->stop = stopparen;
2795 newframe->prev = frame;
2796
2797 frame = newframe;
2798 scan = start;
2799 stopparen = paren;
2800 last = end;
2801
2802 continue;
2803 }
2804 }
2805 else if (OP(scan) == EXACT) {
2806 I32 l = STR_LEN(scan);
2807 UV uc;
2808 if (UTF) {
2809 const U8 * const s = (U8*)STRING(scan);
2810 l = utf8_length(s, s + l);
2811 uc = utf8_to_uvchr(s, NULL);
2812 } else {
2813 uc = *((U8*)STRING(scan));
2814 }
2815 min += l;
2816 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2817 /* The code below prefers earlier match for fixed
2818 offset, later match for variable offset. */
2819 if (data->last_end == -1) { /* Update the start info. */
2820 data->last_start_min = data->pos_min;
2821 data->last_start_max = is_inf
2822 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2823 }
8aa23a47
YO
2824 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2825 if (UTF)
2826 SvUTF8_on(data->last_found);
2827 {
2828 SV * const sv = data->last_found;
2829 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2830 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2831 if (mg && mg->mg_len >= 0)
2832 mg->mg_len += utf8_length((U8*)STRING(scan),
2833 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2834 }
8aa23a47
YO
2835 data->last_end = data->pos_min + l;
2836 data->pos_min += l; /* As in the first entry. */
2837 data->flags &= ~SF_BEFORE_EOL;
2838 }
2839 if (flags & SCF_DO_STCLASS_AND) {
2840 /* Check whether it is compatible with what we know already! */
2841 int compat = 1;
2842
2843 if (uc >= 0x100 ||
2844 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2845 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2846 && (!(data->start_class->flags & ANYOF_FOLD)
2847 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2848 )
2849 compat = 0;
2850 ANYOF_CLASS_ZERO(data->start_class);
2851 ANYOF_BITMAP_ZERO(data->start_class);
2852 if (compat)
2853 ANYOF_BITMAP_SET(data->start_class, uc);
2854 data->start_class->flags &= ~ANYOF_EOS;
2855 if (uc < 0x100)
2856 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2857 }
2858 else if (flags & SCF_DO_STCLASS_OR) {
2859 /* false positive possible if the class is case-folded */
2860 if (uc < 0x100)
2861 ANYOF_BITMAP_SET(data->start_class, uc);
2862 else
2863 data->start_class->flags |= ANYOF_UNICODE_ALL;
2864 data->start_class->flags &= ~ANYOF_EOS;
2865 cl_and(data->start_class, and_withp);
2866 }
2867 flags &= ~SCF_DO_STCLASS;
2868 }
2869 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2870 I32 l = STR_LEN(scan);
2871 UV uc = *((U8*)STRING(scan));
2872
2873 /* Search for fixed substrings supports EXACT only. */
2874 if (flags & SCF_DO_SUBSTR) {
2875 assert(data);
304ee84b 2876 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2877 }
2878 if (UTF) {
2879 const U8 * const s = (U8 *)STRING(scan);
2880 l = utf8_length(s, s + l);
2881 uc = utf8_to_uvchr(s, NULL);
2882 }
2883 min += l;
2884 if (flags & SCF_DO_SUBSTR)
2885 data->pos_min += l;
2886 if (flags & SCF_DO_STCLASS_AND) {
2887 /* Check whether it is compatible with what we know already! */
2888 int compat = 1;
2889
2890 if (uc >= 0x100 ||
2891 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2892 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2893 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2894 compat = 0;
2895 ANYOF_CLASS_ZERO(data->start_class);
2896 ANYOF_BITMAP_ZERO(data->start_class);
2897 if (compat) {
2898 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2899 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2900 data->start_class->flags |= ANYOF_FOLD;
2901 if (OP(scan) == EXACTFL)
2902 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2903 }
8aa23a47
YO
2904 }
2905 else if (flags & SCF_DO_STCLASS_OR) {
2906 if (data->start_class->flags & ANYOF_FOLD) {
2907 /* false positive possible if the class is case-folded.
2908 Assume that the locale settings are the same... */
1aa99e6b
IH
2909 if (uc < 0x100)
2910 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2911 data->start_class->flags &= ~ANYOF_EOS;
2912 }
8aa23a47 2913 cl_and(data->start_class, and_withp);
653099ff 2914 }
8aa23a47
YO
2915 flags &= ~SCF_DO_STCLASS;
2916 }
2917 else if (strchr((const char*)PL_varies,OP(scan))) {
2918 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2919 I32 f = flags, pos_before = 0;
2920 regnode * const oscan = scan;
2921 struct regnode_charclass_class this_class;
2922 struct regnode_charclass_class *oclass = NULL;
2923 I32 next_is_eval = 0;
2924
2925 switch (PL_regkind[OP(scan)]) {
2926 case WHILEM: /* End of (?:...)* . */
2927 scan = NEXTOPER(scan);
2928 goto finish;
2929 case PLUS:
2930 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2931 next = NEXTOPER(scan);
2932 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2933 mincount = 1;
2934 maxcount = REG_INFTY;
2935 next = regnext(scan);
2936 scan = NEXTOPER(scan);
2937 goto do_curly;
2938 }
2939 }
2940 if (flags & SCF_DO_SUBSTR)
2941 data->pos_min++;
2942 min++;
2943 /* Fall through. */
2944 case STAR:
2945 if (flags & SCF_DO_STCLASS) {
2946 mincount = 0;
2947 maxcount = REG_INFTY;
2948 next = regnext(scan);
2949 scan = NEXTOPER(scan);
2950 goto do_curly;
2951 }
2952 is_inf = is_inf_internal = 1;
2953 scan = regnext(scan);
c277df42 2954 if (flags & SCF_DO_SUBSTR) {
304ee84b 2955 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2956 data->longest = &(data->longest_float);
c277df42 2957 }
8aa23a47
YO
2958 goto optimize_curly_tail;
2959 case CURLY:
2960 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2961 && (scan->flags == stopparen))
2962 {
2963 mincount = 1;
2964 maxcount = 1;
2965 } else {
2966 mincount = ARG1(scan);
2967 maxcount = ARG2(scan);
653099ff 2968 }
8aa23a47
YO
2969 next = regnext(scan);
2970 if (OP(scan) == CURLYX) {
2971 I32 lp = (data ? *(data->last_closep) : 0);
2972 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2973 }
8aa23a47
YO
2974 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2975 next_is_eval = (OP(scan) == EVAL);
2976 do_curly:
2977 if (flags & SCF_DO_SUBSTR) {
304ee84b 2978 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 2979 pos_before = data->pos_min;
b45f050a 2980 }
8aa23a47
YO
2981 if (data) {
2982 fl = data->flags;
2983 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2984 if (is_inf)
2985 data->flags |= SF_IS_INF;
2986 }
2987 if (flags & SCF_DO_STCLASS) {
2988 cl_init(pRExC_state, &this_class);
2989 oclass = data->start_class;
2990 data->start_class = &this_class;
2991 f |= SCF_DO_STCLASS_AND;
2992 f &= ~SCF_DO_STCLASS_OR;
2993 }
2994 /* These are the cases when once a subexpression
2995 fails at a particular position, it cannot succeed
2996 even after backtracking at the enclosing scope.
2997
2998 XXXX what if minimal match and we are at the
2999 initial run of {n,m}? */
3000 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3001 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3002
8aa23a47
YO
3003 /* This will finish on WHILEM, setting scan, or on NULL: */
3004 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3005 last, data, stopparen, recursed, NULL,
3006 (mincount == 0
3007 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3008
8aa23a47
YO
3009 if (flags & SCF_DO_STCLASS)
3010 data->start_class = oclass;
3011 if (mincount == 0 || minnext == 0) {
3012 if (flags & SCF_DO_STCLASS_OR) {
3013 cl_or(pRExC_state, data->start_class, &this_class);
3014 }
3015 else if (flags & SCF_DO_STCLASS_AND) {
3016 /* Switch to OR mode: cache the old value of
3017 * data->start_class */
3018 INIT_AND_WITHP;
3019 StructCopy(data->start_class, and_withp,
3020 struct regnode_charclass_class);
3021 flags &= ~SCF_DO_STCLASS_AND;
3022 StructCopy(&this_class, data->start_class,
3023 struct regnode_charclass_class);
3024 flags |= SCF_DO_STCLASS_OR;
3025 data->start_class->flags |= ANYOF_EOS;
3026 }
3027 } else { /* Non-zero len */
3028 if (flags & SCF_DO_STCLASS_OR) {
3029 cl_or(pRExC_state, data->start_class, &this_class);
3030 cl_and(data->start_class, and_withp);
3031 }
3032 else if (flags & SCF_DO_STCLASS_AND)
3033 cl_and(data->start_class, &this_class);
3034 flags &= ~SCF_DO_STCLASS;
3035 }
3036 if (!scan) /* It was not CURLYX, but CURLY. */
3037 scan = next;
3038 if ( /* ? quantifier ok, except for (?{ ... }) */
3039 (next_is_eval || !(mincount == 0 && maxcount == 1))
3040 && (minnext == 0) && (deltanext == 0)
3041 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3042 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3043 && ckWARN(WARN_REGEXP))
3044 {
3045 vWARN(RExC_parse,
3046 "Quantifier unexpected on zero-length expression");
3047 }
3048
3049 min += minnext * mincount;
3050 is_inf_internal |= ((maxcount == REG_INFTY
3051 && (minnext + deltanext) > 0)
3052 || deltanext == I32_MAX);
3053 is_inf |= is_inf_internal;
3054 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3055
3056 /* Try powerful optimization CURLYX => CURLYN. */
3057 if ( OP(oscan) == CURLYX && data
3058 && data->flags & SF_IN_PAR
3059 && !(data->flags & SF_HAS_EVAL)
3060 && !deltanext && minnext == 1 ) {
3061 /* Try to optimize to CURLYN. */
3062 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3063 regnode * const nxt1 = nxt;
497b47a8 3064#ifdef DEBUGGING
8aa23a47 3065 regnode *nxt2;
497b47a8 3066#endif
c277df42 3067
8aa23a47
YO
3068 /* Skip open. */
3069 nxt = regnext(nxt);
3070 if (!strchr((const char*)PL_simple,OP(nxt))
3071 && !(PL_regkind[OP(nxt)] == EXACT
3072 && STR_LEN(nxt) == 1))
3073 goto nogo;
497b47a8 3074#ifdef DEBUGGING
8aa23a47 3075 nxt2 = nxt;
497b47a8 3076#endif
8aa23a47
YO
3077 nxt = regnext(nxt);
3078 if (OP(nxt) != CLOSE)
3079 goto nogo;
3080 if (RExC_open_parens) {
3081 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3082 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3083 }
3084 /* Now we know that nxt2 is the only contents: */
3085 oscan->flags = (U8)ARG(nxt);
3086 OP(oscan) = CURLYN;
3087 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3088
c277df42 3089#ifdef DEBUGGING
8aa23a47
YO
3090 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3091 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3092 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3093 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3094 OP(nxt + 1) = OPTIMIZED; /* was count. */
3095 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3096#endif
8aa23a47
YO
3097 }
3098 nogo:
3099
3100 /* Try optimization CURLYX => CURLYM. */
3101 if ( OP(oscan) == CURLYX && data
3102 && !(data->flags & SF_HAS_PAR)
3103 && !(data->flags & SF_HAS_EVAL)
3104 && !deltanext /* atom is fixed width */
3105 && minnext != 0 /* CURLYM can't handle zero width */
3106 ) {
3107 /* XXXX How to optimize if data == 0? */
3108 /* Optimize to a simpler form. */
3109 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3110 regnode *nxt2;
3111
3112 OP(oscan) = CURLYM;
3113 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3114 && (OP(nxt2) != WHILEM))
3115 nxt = nxt2;
3116 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3117 /* Need to optimize away parenths. */
3118 if (data->flags & SF_IN_PAR) {
3119 /* Set the parenth number. */
3120 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3121
3122 if (OP(nxt) != CLOSE)
3123 FAIL("Panic opt close");
3124 oscan->flags = (U8)ARG(nxt);
3125 if (RExC_open_parens) {
3126 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3127 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3128 }
8aa23a47
YO
3129 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3130 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3131
c277df42 3132#ifdef DEBUGGING
8aa23a47
YO
3133 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3134 OP(nxt + 1) = OPTIMIZED; /* was count. */
3135 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3136 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3137#endif
c277df42 3138#if 0
8aa23a47
YO
3139 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3140 regnode *nnxt = regnext(nxt1);
3141
3142 if (nnxt == nxt) {
3143 if (reg_off_by_arg[OP(nxt1)])
3144 ARG_SET(nxt1, nxt2 - nxt1);
3145 else if (nxt2 - nxt1 < U16_MAX)
3146 NEXT_OFF(nxt1) = nxt2 - nxt1;
3147 else
3148 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3149 }
8aa23a47 3150 nxt1 = nnxt;
c277df42 3151 }
5d1c421c 3152#endif
8aa23a47
YO
3153 /* Optimize again: */
3154 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3155 NULL, stopparen, recursed, NULL, 0,depth+1);
3156 }
3157 else
3158 oscan->flags = 0;
3159 }
3160 else if ((OP(oscan) == CURLYX)
3161 && (flags & SCF_WHILEM_VISITED_POS)
3162 /* See the comment on a similar expression above.
3163 However, this time it not a subexpression
3164 we care about, but the expression itself. */
3165 && (maxcount == REG_INFTY)
3166 && data && ++data->whilem_c < 16) {
3167 /* This stays as CURLYX, we can put the count/of pair. */
3168 /* Find WHILEM (as in regexec.c) */
3169 regnode *nxt = oscan + NEXT_OFF(oscan);
3170
3171 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3172 nxt += ARG(nxt);
3173 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3174 | (RExC_whilem_seen << 4)); /* On WHILEM */
3175 }
3176 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3177 pars++;
3178 if (flags & SCF_DO_SUBSTR) {
3179 SV *last_str = NULL;
3180 int counted = mincount != 0;
a0ed51b3 3181
8aa23a47
YO
3182 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3183#if defined(SPARC64_GCC_WORKAROUND)
3184 I32 b = 0;
3185 STRLEN l = 0;
3186 const char *s = NULL;
3187 I32 old = 0;
b515a41d 3188
8aa23a47
YO
3189 if (pos_before >= data->last_start_min)
3190 b = pos_before;
3191 else
3192 b = data->last_start_min;
b515a41d 3193
8aa23a47
YO
3194 l = 0;
3195 s = SvPV_const(data->last_found, l);
3196 old = b - data->last_start_min;
3197
3198#else
3199 I32 b = pos_before >= data->last_start_min
3200 ? pos_before : data->last_start_min;
3201 STRLEN l;
3202 const char * const s = SvPV_const(data->last_found, l);
3203 I32 old = b - data->last_start_min;
3204#endif
3205
3206 if (UTF)
3207 old = utf8_hop((U8*)s, old) - (U8*)s;
3208
3209 l -= old;
3210 /* Get the added string: */
3211 last_str = newSVpvn(s + old, l);
3212 if (UTF)
3213 SvUTF8_on(last_str);
3214 if (deltanext == 0 && pos_before == b) {
3215 /* What was added is a constant string */
3216 if (mincount > 1) {
3217 SvGROW(last_str, (mincount * l) + 1);
3218 repeatcpy(SvPVX(last_str) + l,
3219 SvPVX_const(last_str), l, mincount - 1);
3220 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3221 /* Add additional parts. */
3222 SvCUR_set(data->last_found,
3223 SvCUR(data->last_found) - l);
3224 sv_catsv(data->last_found, last_str);
3225 {
3226 SV * sv = data->last_found;
3227 MAGIC *mg =
3228 SvUTF8(sv) && SvMAGICAL(sv) ?
3229 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3230 if (mg && mg->mg_len >= 0)
3231 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3232 }
8aa23a47 3233 data->last_end += l * (mincount - 1);
b515a41d 3234 }
8aa23a47
YO
3235 } else {
3236 /* start offset must point into the last copy */
3237 data->last_start_min += minnext * (mincount - 1);
3238 data->last_start_max += is_inf ? I32_MAX
3239 : (maxcount - 1) * (minnext + data->pos_delta);
3240 }
c277df42 3241 }
8aa23a47
YO
3242 /* It is counted once already... */
3243 data->pos_min += minnext * (mincount - counted);
3244 data->pos_delta += - counted * deltanext +
3245 (minnext + deltanext) * maxcount - minnext * mincount;
3246 if (mincount != maxcount) {
3247 /* Cannot extend fixed substrings found inside
3248 the group. */
304ee84b 3249 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3250 if (mincount && last_str) {
3251 SV * const sv = data->last_found;
3252 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3253 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3254
3255 if (mg)
3256 mg->mg_len = -1;
3257 sv_setsv(sv, last_str);
3258 data->last_end = data->pos_min;
3259 data->last_start_min =
3260 data->pos_min - CHR_SVLEN(last_str);
3261 data->last_start_max = is_inf
3262 ? I32_MAX
3263 : data->pos_min + data->pos_delta
3264 - CHR_SVLEN(last_str);
3265 }
3266 data->longest = &(data->longest_float);
3267 }
3268 SvREFCNT_dec(last_str);
c277df42 3269 }
8aa23a47
YO
3270 if (data && (fl & SF_HAS_EVAL))
3271 data->flags |= SF_HAS_EVAL;
3272 optimize_curly_tail:
3273 if (OP(oscan) != CURLYX) {
3274 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3275 && NEXT_OFF(next))
3276 NEXT_OFF(oscan) += NEXT_OFF(next);
3277 }
3278 continue;
3279 default: /* REF and CLUMP only? */
3280 if (flags & SCF_DO_SUBSTR) {
304ee84b 3281 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3282 data->longest = &(data->longest_float);
3283 }
3284 is_inf = is_inf_internal = 1;
3285 if (flags & SCF_DO_STCLASS_OR)
3286 cl_anything(pRExC_state, data->start_class);
3287 flags &= ~SCF_DO_STCLASS;
3288 break;
c277df42 3289 }
8aa23a47
YO
3290 }
3291 else if (strchr((const char*)PL_simple,OP(scan))) {
3292 int value = 0;
653099ff 3293
8aa23a47 3294 if (flags & SCF_DO_SUBSTR) {
304ee84b 3295 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3296 data->pos_min++;
3297 }
3298 min++;
3299 if (flags & SCF_DO_STCLASS) {
3300 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3301
8aa23a47
YO
3302 /* Some of the logic below assumes that switching
3303 locale on will only add false positives. */
3304 switch (PL_regkind[OP(scan)]) {
3305 case SANY:
3306 default:
3307 do_default:
3308 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3309 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3310 cl_anything(pRExC_state, data->start_class);
3311 break;
3312 case REG_ANY:
3313 if (OP(scan) == SANY)
3314 goto do_default;
3315 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3316 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3317 || (data->start_class->flags & ANYOF_CLASS));
3318 cl_anything(pRExC_state, data->start_class);
653099ff 3319 }
8aa23a47
YO
3320 if (flags & SCF_DO_STCLASS_AND || !value)
3321 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3322 break;
3323 case ANYOF:
3324 if (flags & SCF_DO_STCLASS_AND)
3325 cl_and(data->start_class,
3326 (struct regnode_charclass_class*)scan);
653099ff 3327 else
8aa23a47
YO
3328 cl_or(pRExC_state, data->start_class,
3329 (struct regnode_charclass_class*)scan);
3330 break;
3331 case ALNUM:
3332 if (flags & SCF_DO_STCLASS_AND) {
3333 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3334 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3335 for (value = 0; value < 256; value++)
3336 if (!isALNUM(value))
3337 ANYOF_BITMAP_CLEAR(data->start_class, value);
3338 }
653099ff 3339 }
8aa23a47
YO
3340 else {
3341 if (data->start_class->flags & ANYOF_LOCALE)
3342 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3343 else {
3344 for (value = 0; value < 256; value++)
3345 if (isALNUM(value))
3346 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3347 }
8aa23a47
YO
3348 }
3349 break;
3350 case ALNUML:
3351 if (flags & SCF_DO_STCLASS_AND) {
3352 if (data->start_class->flags & ANYOF_LOCALE)
3353 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3354 }
3355 else {
3356 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3357 data->start_class->flags |= ANYOF_LOCALE;
3358 }
3359 break;
3360 case NALNUM:
3361 if (flags & SCF_DO_STCLASS_AND) {
3362 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3363 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3364 for (value = 0; value < 256; value++)
3365 if (isALNUM(value))
3366 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3367 }
3368 }
8aa23a47
YO
3369 else {
3370 if (data->start_class->flags & ANYOF_LOCALE)
3371 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3372 else {
3373 for (value = 0; value < 256; value++)
3374 if (!isALNUM(value))
3375 ANYOF_BITMAP_SET(data->start_class, value);
3376 }
653099ff 3377 }
8aa23a47
YO
3378 break;
3379 case NALNUML:
3380 if (flags & SCF_DO_STCLASS_AND) {
3381 if (data->start_class->flags & ANYOF_LOCALE)
3382 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3383 }
8aa23a47
YO
3384 else {
3385 data->start_class->flags |= ANYOF_LOCALE;
3386 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3387 }
3388 break;
3389 case SPACE:
3390 if (flags & SCF_DO_STCLASS_AND) {
3391 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3392 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3393 for (value = 0; value < 256; value++)
3394 if (!isSPACE(value))
3395 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3396 }
3397 }
8aa23a47
YO
3398 else {
3399 if (data->start_class->flags & ANYOF_LOCALE)
3400 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3401 else {
3402 for (value = 0; value < 256; value++)
3403 if (isSPACE(value))
3404 ANYOF_BITMAP_SET(data->start_class, value);
3405 }
653099ff 3406 }
8aa23a47
YO
3407 break;
3408 case SPACEL:
3409 if (flags & SCF_DO_STCLASS_AND) {
3410 if (data->start_class->flags & ANYOF_LOCALE)
3411 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3412 }
3413 else {
3414 data->start_class->flags |= ANYOF_LOCALE;
3415 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3416 }
3417 break;
3418 case NSPACE:
3419 if (flags & SCF_DO_STCLASS_AND) {
3420 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3421 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3422 for (value = 0; value < 256; value++)
3423 if (isSPACE(value))
3424 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3425 }
8aa23a47
YO
3426 }
3427 else {
3428 if (data->start_class->flags & ANYOF_LOCALE)
3429 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3430 else {
3431 for (value = 0; value < 256; value++)
3432 if (!isSPACE(value))
3433 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3434 }
3435 }
8aa23a47
YO
3436 break;
3437 case NSPACEL:
3438 if (flags & SCF_DO_STCLASS_AND) {
3439 if (data->start_class->flags & ANYOF_LOCALE) {
3440 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3441 for (value = 0; value < 256; value++)
3442 if (!isSPACE(value))
3443 ANYOF_BITMAP_CLEAR(data->start_class, value);
3444 }
653099ff 3445 }
8aa23a47
YO
3446 else {
3447 data->start_class->flags |= ANYOF_LOCALE;
3448 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3449 }
3450 break;
3451 case DIGIT:
3452 if (flags & SCF_DO_STCLASS_AND) {
3453 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3454 for (value = 0; value < 256; value++)
3455 if (!isDIGIT(value))
3456 ANYOF_BITMAP_CLEAR(data->start_class, value);
3457 }
3458 else {
3459 if (data->start_class->flags & ANYOF_LOCALE)
3460 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3461 else {
3462 for (value = 0; value < 256; value++)
3463 if (isDIGIT(value))
3464 ANYOF_BITMAP_SET(data->start_class, value);
3465 }
3466 }
3467 break;
3468 case NDIGIT:
3469 if (flags & SCF_DO_STCLASS_AND) {
3470 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3471 for (value = 0; value < 256; value++)
3472 if (isDIGIT(value))
3473 ANYOF_BITMAP_CLEAR(data->start_class, value);
3474 }
3475 else {
3476 if (data->start_class->flags & ANYOF_LOCALE)
3477 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3478 else {
3479 for (value = 0; value < 256; value++)
3480 if (!isDIGIT(value))
3481 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3482 }
3483 }
8aa23a47
YO
3484 break;
3485 }
3486 if (flags & SCF_DO_STCLASS_OR)
3487 cl_and(data->start_class, and_withp);
3488 flags &= ~SCF_DO_STCLASS;
3489 }
3490 }
3491 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3492 data->flags |= (OP(scan) == MEOL
3493 ? SF_BEFORE_MEOL
3494 : SF_BEFORE_SEOL);
3495 }
3496 else if ( PL_regkind[OP(scan)] == BRANCHJ
3497 /* Lookbehind, or need to calculate parens/evals/stclass: */
3498 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3499 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3500 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3501 || OP(scan) == UNLESSM )
3502 {
3503 /* Negative Lookahead/lookbehind
3504 In this case we can't do fixed string optimisation.
3505 */
1de06328 3506
8aa23a47
YO
3507 I32 deltanext, minnext, fake = 0;
3508 regnode *nscan;
3509 struct regnode_charclass_class intrnl;
3510 int f = 0;
1de06328 3511
8aa23a47
YO
3512 data_fake.flags = 0;
3513 if (data) {
3514 data_fake.whilem_c = data->whilem_c;
3515 data_fake.last_closep = data->last_closep;
c277df42 3516 }
8aa23a47
YO
3517 else
3518 data_fake.last_closep = &fake;
58e23c8d 3519 data_fake.pos_delta = delta;
8aa23a47
YO
3520 if ( flags & SCF_DO_STCLASS && !scan->flags
3521 && OP(scan) == IFMATCH ) { /* Lookahead */
3522 cl_init(pRExC_state, &intrnl);
3523 data_fake.start_class = &intrnl;
3524 f |= SCF_DO_STCLASS_AND;
3525 }
3526 if (flags & SCF_WHILEM_VISITED_POS)
3527 f |= SCF_WHILEM_VISITED_POS;
3528 next = regnext(scan);
3529 nscan = NEXTOPER(NEXTOPER(scan));
3530 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3531 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3532 if (scan->flags) {
3533 if (deltanext) {
58e23c8d 3534 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3535 }
3536 else if (minnext > (I32)U8_MAX) {
58e23c8d 3537 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3538 }
3539 scan->flags = (U8)minnext;
3540 }
3541 if (data) {
3542 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3543 pars++;
3544 if (data_fake.flags & SF_HAS_EVAL)
3545 data->flags |= SF_HAS_EVAL;
3546 data->whilem_c = data_fake.whilem_c;
3547 }
3548 if (f & SCF_DO_STCLASS_AND) {
3549 const int was = (data->start_class->flags & ANYOF_EOS);
3550
3551 cl_and(data->start_class, &intrnl);
3552 if (was)
3553 data->start_class->flags |= ANYOF_EOS;
3554 }
cb434fcc 3555 }
8aa23a47
YO
3556#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3557 else {
3558 /* Positive Lookahead/lookbehind
3559 In this case we can do fixed string optimisation,
3560 but we must be careful about it. Note in the case of
3561 lookbehind the positions will be offset by the minimum
3562 length of the pattern, something we won't know about
3563 until after the recurse.
3564 */
3565 I32 deltanext, fake = 0;
3566 regnode *nscan;
3567 struct regnode_charclass_class intrnl;
3568 int f = 0;
3569 /* We use SAVEFREEPV so that when the full compile
3570 is finished perl will clean up the allocated
3571 minlens when its all done. This was we don't
3572 have to worry about freeing them when we know
3573 they wont be used, which would be a pain.
3574 */
3575 I32 *minnextp;
3576 Newx( minnextp, 1, I32 );
3577 SAVEFREEPV(minnextp);
3578
3579 if (data) {
3580 StructCopy(data, &data_fake, scan_data_t);
3581 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3582 f |= SCF_DO_SUBSTR;
3583 if (scan->flags)
304ee84b 3584 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3585 data_fake.last_found=newSVsv(data->last_found);
3586 }
3587 }
3588 else
3589 data_fake.last_closep = &fake;
3590 data_fake.flags = 0;
58e23c8d 3591 data_fake.pos_delta = delta;
8aa23a47
YO
3592 if (is_inf)
3593 data_fake.flags |= SF_IS_INF;
3594 if ( flags & SCF_DO_STCLASS && !scan->flags
3595 && OP(scan) == IFMATCH ) { /* Lookahead */
3596 cl_init(pRExC_state, &intrnl);
3597 data_fake.start_class = &intrnl;
3598 f |= SCF_DO_STCLASS_AND;
3599 }
3600 if (flags & SCF_WHILEM_VISITED_POS)
3601 f |= SCF_WHILEM_VISITED_POS;
3602 next = regnext(scan);
3603 nscan = NEXTOPER(NEXTOPER(scan));
3604
3605 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3606 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3607 if (scan->flags) {
3608 if (deltanext) {
58e23c8d 3609 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3610 }
3611 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3612 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3613 }
3614 scan->flags = (U8)*minnextp;
3615 }
3616
3617 *minnextp += min;
3618
3619 if (f & SCF_DO_STCLASS_AND) {
3620 const int was = (data->start_class->flags & ANYOF_EOS);
3621
3622 cl_and(data->start_class, &intrnl);
3623 if (was)
3624 data->start_class->flags |= ANYOF_EOS;
3625 }
3626 if (data) {
3627 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3628 pars++;
3629 if (data_fake.flags & SF_HAS_EVAL)
3630 data->flags |= SF_HAS_EVAL;
3631 data->whilem_c = data_fake.whilem_c;
3632 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3633 if (RExC_rx->minlen<*minnextp)
3634 RExC_rx->minlen=*minnextp;
304ee84b 3635 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3636 SvREFCNT_dec(data_fake.last_found);
3637
3638 if ( data_fake.minlen_fixed != minlenp )
3639 {
3640 data->offset_fixed= data_fake.offset_fixed;
3641 data->minlen_fixed= data_fake.minlen_fixed;
3642 data->lookbehind_fixed+= scan->flags;
3643 }
3644 if ( data_fake.minlen_float != minlenp )
3645 {
3646 data->minlen_float= data_fake.minlen_float;
3647 data->offset_float_min=data_fake.offset_float_min;
3648 data->offset_float_max=data_fake.offset_float_max;
3649 data->lookbehind_float+= scan->flags;
3650 }
3651 }
3652 }
3653
3654
40d049e4 3655 }
8aa23a47
YO
3656#endif
3657 }
3658 else if (OP(scan) == OPEN) {
3659 if (stopparen != (I32)ARG(scan))
3660 pars++;
3661 }
3662 else if (OP(scan) == CLOSE) {
3663 if (stopparen == (I32)ARG(scan)) {
3664 break;
3665 }
3666 if ((I32)ARG(scan) == is_par) {
3667 next = regnext(scan);
b515a41d 3668
8aa23a47
YO
3669 if ( next && (OP(next) != WHILEM) && next < last)
3670 is_par = 0; /* Disable optimization */
40d049e4 3671 }
8aa23a47
YO
3672 if (data)
3673 *(data->last_closep) = ARG(scan);
3674 }
3675 else if (OP(scan) == EVAL) {
c277df42
IZ
3676 if (data)
3677 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3678 }
3679 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3680 if (flags & SCF_DO_SUBSTR) {
304ee84b 3681 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3682 flags &= ~SCF_DO_SUBSTR;
40d049e4 3683 }
8aa23a47
YO
3684 if (data && OP(scan)==ACCEPT) {
3685 data->flags |= SCF_SEEN_ACCEPT;
3686 if (stopmin > min)
3687 stopmin = min;
e2e6a0f1 3688 }
8aa23a47
YO
3689 }
3690 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3691 {
0f5d15d6 3692 if (flags & SCF_DO_SUBSTR) {
304ee84b 3693 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3694 data->longest = &(data->longest_float);
3695 }
3696 is_inf = is_inf_internal = 1;
653099ff 3697 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3698 cl_anything(pRExC_state, data->start_class);
96776eda 3699 flags &= ~SCF_DO_STCLASS;
8aa23a47 3700 }
58e23c8d 3701 else if (OP(scan) == GPOS) {
bbe252da 3702 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3703 !(delta || is_inf || (data && data->pos_delta)))
3704 {
bbe252da
YO
3705 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3706 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3707 if (RExC_rx->gofs < (U32)min)
3708 RExC_rx->gofs = min;
3709 } else {
bbe252da 3710 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3711 RExC_rx->gofs = 0;
3712 }
3713 }
786e8c11 3714#ifdef TRIE_STUDY_OPT
40d049e4 3715#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3716 else if (PL_regkind[OP(scan)] == TRIE) {
3717 /* NOTE - There is similar code to this block above for handling
3718 BRANCH nodes on the initial study. If you change stuff here
3719 check there too. */
3720 regnode *trie_node= scan;
3721 regnode *tail= regnext(scan);
f8fc2ecf 3722 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3723 I32 max1 = 0, min1 = I32_MAX;
3724 struct regnode_charclass_class accum;
3725
3726 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3727 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3728 if (flags & SCF_DO_STCLASS)
3729 cl_init_zero(pRExC_state, &accum);
3730
3731 if (!trie->jump) {
3732 min1= trie->minlen;
3733 max1= trie->maxlen;
3734 } else {
3735 const regnode *nextbranch= NULL;
3736 U32 word;
3737
3738 for ( word=1 ; word <= trie->wordcount ; word++)
3739 {
3740 I32 deltanext=0, minnext=0, f = 0, fake;
3741 struct regnode_charclass_class this_class;
3742
3743 data_fake.flags = 0;
3744 if (data) {
3745 data_fake.whilem_c = data->whilem_c;
3746 data_fake.last_closep = data->last_closep;
3747 }
3748 else
3749 data_fake.last_closep = &fake;
58e23c8d 3750 data_fake.pos_delta = delta;
8aa23a47
YO
3751 if (flags & SCF_DO_STCLASS) {
3752 cl_init(pRExC_state, &this_class);
3753 data_fake.start_class = &this_class;
3754 f = SCF_DO_STCLASS_AND;
3755 }
3756 if (flags & SCF_WHILEM_VISITED_POS)
3757 f |= SCF_WHILEM_VISITED_POS;
3758
3759 if (trie->jump[word]) {
3760 if (!nextbranch)
3761 nextbranch = trie_node + trie->jump[0];
3762 scan= trie_node + trie->jump[word];
3763 /* We go from the jump point to the branch that follows
3764 it. Note this means we need the vestigal unused branches
3765 even though they arent otherwise used.
3766 */
3767 minnext = study_chunk(pRExC_state, &scan, minlenp,
3768 &deltanext, (regnode *)nextbranch, &data_fake,
3769 stopparen, recursed, NULL, f,depth+1);
3770 }
3771 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3772 nextbranch= regnext((regnode*)nextbranch);
3773
3774 if (min1 > (I32)(minnext + trie->minlen))
3775 min1 = minnext + trie->minlen;
3776 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3777 max1 = minnext + deltanext + trie->maxlen;
3778 if (deltanext == I32_MAX)
3779 is_inf = is_inf_internal = 1;
3780
3781 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3782 pars++;
3783 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3784 if ( stopmin > min + min1)
3785 stopmin = min + min1;
3786 flags &= ~SCF_DO_SUBSTR;
3787 if (data)
3788 data->flags |= SCF_SEEN_ACCEPT;
3789 }
3790 if (data) {
3791 if (data_fake.flags & SF_HAS_EVAL)
3792 data->flags |= SF_HAS_EVAL;
3793 data->whilem_c = data_fake.whilem_c;
3794 }
3795 if (flags & SCF_DO_STCLASS)
3796 cl_or(pRExC_state, &accum, &this_class);
3797 }
3798 }
3799 if (flags & SCF_DO_SUBSTR) {
3800 data->pos_min += min1;
3801 data->pos_delta += max1 - min1;
3802 if (max1 != min1 || is_inf)
3803 data->longest = &(data->longest_float);
3804 }
3805 min += min1;
3806 delta += max1 - min1;
3807 if (flags & SCF_DO_STCLASS_OR) {
3808 cl_or(pRExC_state, data->start_class, &accum);
3809 if (min1) {
3810 cl_and(data->start_class, and_withp);
3811 flags &= ~SCF_DO_STCLASS;
3812 }
3813 }
3814 else if (flags & SCF_DO_STCLASS_AND) {
3815 if (min1) {
3816 cl_and(data->start_class, &accum);
3817 flags &= ~SCF_DO_STCLASS;
3818 }
3819 else {
3820 /* Switch to OR mode: cache the old value of
3821