This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Avoid compiler warning about a variable that isn't used when
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
786e8c11 174#define FULL_TRIE_STUDY
07be1b83
YO
175#define TRIE_STCLASS
176#endif
1de06328
YO
177
178
179/* About scan_data_t.
180
181 During optimisation we recurse through the regexp program performing
182 various inplace (keyhole style) optimisations. In addition study_chunk
183 and scan_commit populate this data structure with information about
184 what strings MUST appear in the pattern. We look for the longest
185 string that must appear for at a fixed location, and we look for the
186 longest string that may appear at a floating location. So for instance
187 in the pattern:
188
189 /FOO[xX]A.*B[xX]BAR/
190
191 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
192 strings (because they follow a .* construct). study_chunk will identify
193 both FOO and BAR as being the longest fixed and floating strings respectively.
194
195 The strings can be composites, for instance
196
197 /(f)(o)(o)/
198
199 will result in a composite fixed substring 'foo'.
200
201 For each string some basic information is maintained:
202
203 - offset or min_offset
204 This is the position the string must appear at, or not before.
205 It also implicitly (when combined with minlenp) tells us how many
206 character must match before the string we are searching.
207 Likewise when combined with minlenp and the length of the string
208 tells us how many characters must appear after the string we have
209 found.
210
211 - max_offset
212 Only used for floating strings. This is the rightmost point that
213 the string can appear at. Ifset to I32 max it indicates that the
214 string can occur infinitely far to the right.
215
216 - minlenp
217 A pointer to the minimum length of the pattern that the string
218 was found inside. This is important as in the case of positive
219 lookahead or positive lookbehind we can have multiple patterns
220 involved. Consider
221
222 /(?=FOO).*F/
223
224 The minimum length of the pattern overall is 3, the minimum length
225 of the lookahead part is 3, but the minimum length of the part that
226 will actually match is 1. So 'FOO's minimum length is 3, but the
227 minimum length for the F is 1. This is important as the minimum length
228 is used to determine offsets in front of and behind the string being
229 looked for. Since strings can be composites this is the length of the
230 pattern at the time it was commited with a scan_commit. Note that
231 the length is calculated by study_chunk, so that the minimum lengths
232 are not known until the full pattern has been compiled, thus the
233 pointer to the value.
234
235 - lookbehind
236
237 In the case of lookbehind the string being searched for can be
238 offset past the start point of the final matching string.
239 If this value was just blithely removed from the min_offset it would
240 invalidate some of the calculations for how many chars must match
241 before or after (as they are derived from min_offset and minlen and
242 the length of the string being searched for).
243 When the final pattern is compiled and the data is moved from the
244 scan_data_t structure into the regexp structure the information
245 about lookbehind is factored in, with the information that would
246 have been lost precalculated in the end_shift field for the
247 associated string.
248
249 The fields pos_min and pos_delta are used to store the minimum offset
250 and the delta to the maximum offset at the current point in the pattern.
251
252*/
2c2d71f5
JH
253
254typedef struct scan_data_t {
1de06328
YO
255 /*I32 len_min; unused */
256 /*I32 len_delta; unused */
2c2d71f5
JH
257 I32 pos_min;
258 I32 pos_delta;
259 SV *last_found;
1de06328 260 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
261 I32 last_start_min;
262 I32 last_start_max;
1de06328
YO
263 SV **longest; /* Either &l_fixed, or &l_float. */
264 SV *longest_fixed; /* longest fixed string found in pattern */
265 I32 offset_fixed; /* offset where it starts */
266 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
267 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
268 SV *longest_float; /* longest floating string found in pattern */
269 I32 offset_float_min; /* earliest point in string it can appear */
270 I32 offset_float_max; /* latest point in string it can appear */
271 I32 *minlen_float; /* pointer to the minlen relevent to the string */
272 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
273 I32 flags;
274 I32 whilem_c;
cb434fcc 275 I32 *last_closep;
653099ff 276 struct regnode_charclass_class *start_class;
2c2d71f5
JH
277} scan_data_t;
278
a687059c 279/*
e50aee73 280 * Forward declarations for pregcomp()'s friends.
a687059c 281 */
a0d0e21e 282
27da23d5 283static const scan_data_t zero_scan_data =
1de06328 284 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
285
286#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
287#define SF_BEFORE_SEOL 0x0001
288#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
289#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
290#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
291
09b7f37c
CB
292#ifdef NO_UNARY_PLUS
293# define SF_FIX_SHIFT_EOL (0+2)
294# define SF_FL_SHIFT_EOL (0+4)
295#else
296# define SF_FIX_SHIFT_EOL (+2)
297# define SF_FL_SHIFT_EOL (+4)
298#endif
c277df42
IZ
299
300#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
301#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
302
303#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
304#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
305#define SF_IS_INF 0x0040
306#define SF_HAS_PAR 0x0080
307#define SF_IN_PAR 0x0100
308#define SF_HAS_EVAL 0x0200
309#define SCF_DO_SUBSTR 0x0400
653099ff
GS
310#define SCF_DO_STCLASS_AND 0x0800
311#define SCF_DO_STCLASS_OR 0x1000
312#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 313#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 314
786e8c11
YO
315#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
316
07be1b83 317
eb160463 318#define UTF (RExC_utf8 != 0)
e2509266
JH
319#define LOC ((RExC_flags & PMf_LOCALE) != 0)
320#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 321
ffc61ed2 322#define OOB_UNICODE 12345678
93733859 323#define OOB_NAMEDCLASS -1
b8c5462f 324
a0ed51b3
LW
325#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
326#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
327
8615cb43 328
b45f050a
JF
329/* length of regex to show in messages that don't mark a position within */
330#define RegexLengthToShowInErrorMessages 127
331
332/*
333 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
334 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
335 * op/pragma/warn/regcomp.
336 */
7253e4e3
RK
337#define MARKER1 "<-- HERE" /* marker as it appears in the description */
338#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 339
7253e4e3 340#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
341
342/*
343 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
344 * arg. Show regex, up to a maximum length. If it's too long, chop and add
345 * "...".
346 */
ccb2c380 347#define FAIL(msg) STMT_START { \
bfed75c6 348 const char *ellipses = ""; \
ccb2c380
MP
349 IV len = RExC_end - RExC_precomp; \
350 \
351 if (!SIZE_ONLY) \
352 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
353 if (len > RegexLengthToShowInErrorMessages) { \
354 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
355 len = RegexLengthToShowInErrorMessages - 10; \
356 ellipses = "..."; \
357 } \
358 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
359 msg, (int)len, RExC_precomp, ellipses); \
360} STMT_END
8615cb43 361
b45f050a 362/*
b45f050a
JF
363 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
364 */
ccb2c380 365#define Simple_vFAIL(m) STMT_START { \
a28509cc 366 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
367 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
368 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
b45f050a
JF
370
371/*
372 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
373 */
ccb2c380
MP
374#define vFAIL(m) STMT_START { \
375 if (!SIZE_ONLY) \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
377 Simple_vFAIL(m); \
378} STMT_END
b45f050a
JF
379
380/*
381 * Like Simple_vFAIL(), but accepts two arguments.
382 */
ccb2c380 383#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 384 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
385 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
386 (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
b45f050a
JF
388
389/*
390 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
391 */
ccb2c380
MP
392#define vFAIL2(m,a1) STMT_START { \
393 if (!SIZE_ONLY) \
394 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
395 Simple_vFAIL2(m, a1); \
396} STMT_END
b45f050a
JF
397
398
399/*
400 * Like Simple_vFAIL(), but accepts three arguments.
401 */
ccb2c380 402#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 403 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
404 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
406} STMT_END
b45f050a
JF
407
408/*
409 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
410 */
ccb2c380
MP
411#define vFAIL3(m,a1,a2) STMT_START { \
412 if (!SIZE_ONLY) \
413 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
414 Simple_vFAIL3(m, a1, a2); \
415} STMT_END
b45f050a
JF
416
417/*
418 * Like Simple_vFAIL(), but accepts four arguments.
419 */
ccb2c380 420#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 421 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
422 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
423 (int)offset, RExC_precomp, RExC_precomp + offset); \
424} STMT_END
b45f050a 425
ccb2c380 426#define vWARN(loc,m) STMT_START { \
a28509cc 427 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
431
432#define vWARNdep(loc,m) STMT_START { \
a28509cc 433 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
434 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
435 "%s" REPORT_LOCATION, \
436 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
437} STMT_END
438
439
440#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 441 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
442 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
443 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
444} STMT_END
445
446#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 447 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
448 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
449 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
450} STMT_END
451
452#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 453 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
455 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
457
458#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 459 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
460 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
461 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
9d1d55b5 463
8615cb43 464
cd439c50 465/* Allow for side effects in s */
ccb2c380
MP
466#define REGC(c,s) STMT_START { \
467 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
468} STMT_END
cd439c50 469
fac92740
MJD
470/* Macros for recording node offsets. 20001227 mjd@plover.com
471 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
472 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
473 * Element 0 holds the number n.
07be1b83 474 * Position is 1 indexed.
fac92740
MJD
475 */
476
ccb2c380
MP
477#define Set_Node_Offset_To_R(node,byte) STMT_START { \
478 if (! SIZE_ONLY) { \
479 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 480 __LINE__, (node), (int)(byte))); \
ccb2c380 481 if((node) < 0) { \
551405c4 482 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
483 } else { \
484 RExC_offsets[2*(node)-1] = (byte); \
485 } \
486 } \
487} STMT_END
488
489#define Set_Node_Offset(node,byte) \
490 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
491#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
492
493#define Set_Node_Length_To_R(node,len) STMT_START { \
494 if (! SIZE_ONLY) { \
495 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 496 __LINE__, (int)(node), (int)(len))); \
ccb2c380 497 if((node) < 0) { \
551405c4 498 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
499 } else { \
500 RExC_offsets[2*(node)] = (len); \
501 } \
502 } \
503} STMT_END
504
505#define Set_Node_Length(node,len) \
506 Set_Node_Length_To_R((node)-RExC_emit_start, len)
507#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
508#define Set_Node_Cur_Length(node) \
509 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
510
511/* Get offsets and lengths */
512#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
513#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
514
07be1b83
YO
515#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
517 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
518} STMT_END
519
520
521#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
522#define EXPERIMENTAL_INPLACESCAN
523#endif
524
1de06328
YO
525#define DEBUG_STUDYDATA(data,depth) \
526DEBUG_OPTIMISE_r(if(data){ \
527 PerlIO_printf(Perl_debug_log, \
528 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
529 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
530 (int)(depth)*2, "", \
531 (IV)((data)->pos_min), \
532 (IV)((data)->pos_delta), \
533 (IV)((data)->flags), \
534 (IV)((data)->whilem_c), \
535 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
536 ); \
537 if ((data)->last_found) \
538 PerlIO_printf(Perl_debug_log, \
539 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
540 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
541 SvPVX_const((data)->last_found), \
542 (IV)((data)->last_end), \
543 (IV)((data)->last_start_min), \
544 (IV)((data)->last_start_max), \
545 ((data)->longest && \
546 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
547 SvPVX_const((data)->longest_fixed), \
548 (IV)((data)->offset_fixed), \
549 ((data)->longest && \
550 (data)->longest==&((data)->longest_float)) ? "*" : "", \
551 SvPVX_const((data)->longest_float), \
552 (IV)((data)->offset_float_min), \
553 (IV)((data)->offset_float_max) \
554 ); \
555 PerlIO_printf(Perl_debug_log,"\n"); \
556});
557
acfe0abc 558static void clear_re(pTHX_ void *r);
4327152a 559
653099ff 560/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 561 Update the longest found anchored substring and the longest found
653099ff
GS
562 floating substrings if needed. */
563
4327152a 564STATIC void
1de06328 565S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 566{
e1ec3a88
AL
567 const STRLEN l = CHR_SVLEN(data->last_found);
568 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 569 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 570
c277df42 571 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 572 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
573 if (*data->longest == data->longest_fixed) {
574 data->offset_fixed = l ? data->last_start_min : data->pos_min;
575 if (data->flags & SF_BEFORE_EOL)
b81d288d 576 data->flags
c277df42
IZ
577 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
578 else
579 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
580 data->minlen_fixed=minlenp;
581 data->lookbehind_fixed=0;
a0ed51b3
LW
582 }
583 else {
c277df42 584 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
585 data->offset_float_max = (l
586 ? data->last_start_max
c277df42 587 : data->pos_min + data->pos_delta);
9051bda5
HS
588 if ((U32)data->offset_float_max > (U32)I32_MAX)
589 data->offset_float_max = I32_MAX;
c277df42 590 if (data->flags & SF_BEFORE_EOL)
b81d288d 591 data->flags
c277df42
IZ
592 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
593 else
594 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
595 data->minlen_float=minlenp;
596 data->lookbehind_float=0;
c277df42
IZ
597 }
598 }
599 SvCUR_set(data->last_found, 0);
0eda9292 600 {
a28509cc 601 SV * const sv = data->last_found;
097eb12c
AL
602 if (SvUTF8(sv) && SvMAGICAL(sv)) {
603 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
604 if (mg)
605 mg->mg_len = 0;
606 }
0eda9292 607 }
c277df42
IZ
608 data->last_end = -1;
609 data->flags &= ~SF_BEFORE_EOL;
1de06328 610 DEBUG_STUDYDATA(data,0);
c277df42
IZ
611}
612
653099ff
GS
613/* Can match anything (initialization) */
614STATIC void
097eb12c 615S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 616{
653099ff 617 ANYOF_CLASS_ZERO(cl);
f8bef550 618 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 619 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
620 if (LOC)
621 cl->flags |= ANYOF_LOCALE;
622}
623
624/* Can match anything (initialization) */
625STATIC int
5f66b61c 626S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
627{
628 int value;
629
aaa51d5e 630 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
631 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
632 return 1;
1aa99e6b
IH
633 if (!(cl->flags & ANYOF_UNICODE_ALL))
634 return 0;
10edeb5d 635 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 636 return 0;
653099ff
GS
637 return 1;
638}
639
640/* Can match anything (initialization) */
641STATIC void
097eb12c 642S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 643{
8ecf7187 644 Zero(cl, 1, struct regnode_charclass_class);
653099ff 645 cl->type = ANYOF;
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647}
648
649STATIC void
097eb12c 650S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 651{
8ecf7187 652 Zero(cl, 1, struct regnode_charclass_class);
653099ff 653 cl->type = ANYOF;
830247a4 654 cl_anything(pRExC_state, cl);
653099ff
GS
655 if (LOC)
656 cl->flags |= ANYOF_LOCALE;
657}
658
659/* 'And' a given class with another one. Can create false positives */
660/* We assume that cl is not inverted */
661STATIC void
5f66b61c 662S_cl_and(struct regnode_charclass_class *cl,
a28509cc 663 const struct regnode_charclass_class *and_with)
653099ff 664{
653099ff
GS
665 if (!(and_with->flags & ANYOF_CLASS)
666 && !(cl->flags & ANYOF_CLASS)
667 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
668 && !(and_with->flags & ANYOF_FOLD)
669 && !(cl->flags & ANYOF_FOLD)) {
670 int i;
671
672 if (and_with->flags & ANYOF_INVERT)
673 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
674 cl->bitmap[i] &= ~and_with->bitmap[i];
675 else
676 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
677 cl->bitmap[i] &= and_with->bitmap[i];
678 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
679 if (!(and_with->flags & ANYOF_EOS))
680 cl->flags &= ~ANYOF_EOS;
1aa99e6b 681
14ebb1a2
JH
682 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
683 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
684 cl->flags &= ~ANYOF_UNICODE_ALL;
685 cl->flags |= ANYOF_UNICODE;
686 ARG_SET(cl, ARG(and_with));
687 }
14ebb1a2
JH
688 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
689 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 690 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
691 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
692 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 693 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
694}
695
696/* 'OR' a given class with another one. Can create false positives */
697/* We assume that cl is not inverted */
698STATIC void
097eb12c 699S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 700{
653099ff
GS
701 if (or_with->flags & ANYOF_INVERT) {
702 /* We do not use
703 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
704 * <= (B1 | !B2) | (CL1 | !CL2)
705 * which is wasteful if CL2 is small, but we ignore CL2:
706 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
707 * XXXX Can we handle case-fold? Unclear:
708 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
709 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
710 */
711 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
712 && !(or_with->flags & ANYOF_FOLD)
713 && !(cl->flags & ANYOF_FOLD) ) {
714 int i;
715
716 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
717 cl->bitmap[i] |= ~or_with->bitmap[i];
718 } /* XXXX: logic is complicated otherwise */
719 else {
830247a4 720 cl_anything(pRExC_state, cl);
653099ff
GS
721 }
722 } else {
723 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
724 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 725 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
726 || (cl->flags & ANYOF_FOLD)) ) {
727 int i;
728
729 /* OR char bitmap and class bitmap separately */
730 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
731 cl->bitmap[i] |= or_with->bitmap[i];
732 if (or_with->flags & ANYOF_CLASS) {
733 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
734 cl->classflags[i] |= or_with->classflags[i];
735 cl->flags |= ANYOF_CLASS;
736 }
737 }
738 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 739 cl_anything(pRExC_state, cl);
653099ff
GS
740 }
741 }
742 if (or_with->flags & ANYOF_EOS)
743 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
744
745 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
746 ARG(cl) != ARG(or_with)) {
747 cl->flags |= ANYOF_UNICODE_ALL;
748 cl->flags &= ~ANYOF_UNICODE;
749 }
750 if (or_with->flags & ANYOF_UNICODE_ALL) {
751 cl->flags |= ANYOF_UNICODE_ALL;
752 cl->flags &= ~ANYOF_UNICODE;
753 }
653099ff
GS
754}
755
a3621e74
YO
756#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
757#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
758#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
759#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
760
3dab1dad
YO
761
762#ifdef DEBUGGING
07be1b83 763/*
3dab1dad
YO
764 dump_trie(trie)
765 dump_trie_interim_list(trie,next_alloc)
766 dump_trie_interim_table(trie,next_alloc)
767
768 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
769 The _interim_ variants are used for debugging the interim
770 tables that are used to generate the final compressed
771 representation which is what dump_trie expects.
772
3dab1dad
YO
773 Part of the reason for their existance is to provide a form
774 of documentation as to how the different representations function.
07be1b83
YO
775
776*/
3dab1dad
YO
777
778/*
779 dump_trie(trie)
780 Dumps the final compressed table form of the trie to Perl_debug_log.
781 Used for debugging make_trie().
782*/
783
784STATIC void
785S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
786{
787 U32 state;
ab3bbdeb
YO
788 SV *sv=sv_newmortal();
789 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
790 GET_RE_DEBUG_FLAGS_DECL;
791
ab3bbdeb 792
3dab1dad
YO
793 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
794 (int)depth * 2 + 2,"",
795 "Match","Base","Ofs" );
796
797 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 798 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 799 if ( tmp ) {
ab3bbdeb
YO
800 PerlIO_printf( Perl_debug_log, "%*s",
801 colwidth,
ddc5bc0f 802 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
803 PL_colors[0], PL_colors[1],
804 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
805 PERL_PV_ESCAPE_FIRSTCHAR
806 )
807 );
3dab1dad
YO
808 }
809 }
810 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
811 (int)depth * 2 + 2,"");
812
813 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 814 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
815 PerlIO_printf( Perl_debug_log, "\n");
816
786e8c11 817 for( state = 1 ; state < trie->laststate ; state++ ) {
be8e71aa 818 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
819
820 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
821
822 if ( trie->states[ state ].wordnum ) {
823 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
824 } else {
825 PerlIO_printf( Perl_debug_log, "%6s", "" );
826 }
827
828 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
829
830 if ( base ) {
831 U32 ofs = 0;
832
833 while( ( base + ofs < trie->uniquecharcount ) ||
834 ( base + ofs - trie->uniquecharcount < trie->lasttrans
835 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
836 ofs++;
837
838 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
839
840 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
841 if ( ( base + ofs >= trie->uniquecharcount ) &&
842 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
843 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
844 {
ab3bbdeb
YO
845 PerlIO_printf( Perl_debug_log, "%*"UVXf,
846 colwidth,
3dab1dad
YO
847 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
848 } else {
ab3bbdeb 849 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
850 }
851 }
852
853 PerlIO_printf( Perl_debug_log, "]");
854
855 }
856 PerlIO_printf( Perl_debug_log, "\n" );
857 }
858}
859/*
860 dump_trie_interim_list(trie,next_alloc)
861 Dumps a fully constructed but uncompressed trie in list form.
862 List tries normally only are used for construction when the number of
863 possible chars (trie->uniquecharcount) is very high.
864 Used for debugging make_trie().
865*/
866STATIC void
867S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
868{
869 U32 state;
ab3bbdeb
YO
870 SV *sv=sv_newmortal();
871 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
872 GET_RE_DEBUG_FLAGS_DECL;
873 /* print out the table precompression. */
ab3bbdeb
YO
874 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
875 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
876 "------:-----+-----------------\n" );
3dab1dad
YO
877
878 for( state=1 ; state < next_alloc ; state ++ ) {
879 U16 charid;
880
ab3bbdeb 881 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
882 (int)depth * 2 + 2,"", (UV)state );
883 if ( ! trie->states[ state ].wordnum ) {
884 PerlIO_printf( Perl_debug_log, "%5s| ","");
885 } else {
886 PerlIO_printf( Perl_debug_log, "W%4x| ",
887 trie->states[ state ].wordnum
888 );
889 }
890 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 891 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
892 if ( tmp ) {
893 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
894 colwidth,
ddc5bc0f 895 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
896 PL_colors[0], PL_colors[1],
897 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
898 PERL_PV_ESCAPE_FIRSTCHAR
899 ) ,
3dab1dad
YO
900 TRIE_LIST_ITEM(state,charid).forid,
901 (UV)TRIE_LIST_ITEM(state,charid).newstate
902 );
903 }
ab3bbdeb
YO
904 }
905 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
906 }
907}
908
909/*
910 dump_trie_interim_table(trie,next_alloc)
911 Dumps a fully constructed but uncompressed trie in table form.
912 This is the normal DFA style state transition table, with a few
913 twists to facilitate compression later.
914 Used for debugging make_trie().
915*/
916STATIC void
917S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
918{
919 U32 state;
920 U16 charid;
ab3bbdeb
YO
921 SV *sv=sv_newmortal();
922 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
923 GET_RE_DEBUG_FLAGS_DECL;
924
925 /*
926 print out the table precompression so that we can do a visual check
927 that they are identical.
928 */
929
930 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
931
932 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 933 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 934 if ( tmp ) {
ab3bbdeb
YO
935 PerlIO_printf( Perl_debug_log, "%*s",
936 colwidth,
ddc5bc0f 937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
941 )
942 );
3dab1dad
YO
943 }
944 }
945
946 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
947
948 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 949 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
950 }
951
952 PerlIO_printf( Perl_debug_log, "\n" );
953
954 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
955
956 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
957 (int)depth * 2 + 2,"",
958 (UV)TRIE_NODENUM( state ) );
959
960 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
961 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
962 if (v)
963 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
964 else
965 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
966 }
967 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
968 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
969 } else {
970 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
971 trie->states[ TRIE_NODENUM( state ) ].wordnum );
972 }
973 }
07be1b83 974}
3dab1dad
YO
975
976#endif
977
786e8c11
YO
978/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
979 startbranch: the first branch in the whole branch sequence
980 first : start branch of sequence of branch-exact nodes.
981 May be the same as startbranch
982 last : Thing following the last branch.
983 May be the same as tail.
984 tail : item following the branch sequence
985 count : words in the sequence
986 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
987 depth : indent depth
3dab1dad 988
786e8c11 989Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 990
786e8c11
YO
991A trie is an N'ary tree where the branches are determined by digital
992decomposition of the key. IE, at the root node you look up the 1st character and
993follow that branch repeat until you find the end of the branches. Nodes can be
994marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 995
786e8c11 996 /he|she|his|hers/
72f13be8 997
786e8c11
YO
998would convert into the following structure. Numbers represent states, letters
999following numbers represent valid transitions on the letter from that state, if
1000the number is in square brackets it represents an accepting state, otherwise it
1001will be in parenthesis.
07be1b83 1002
786e8c11
YO
1003 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1004 | |
1005 | (2)
1006 | |
1007 (1) +-i->(6)-+-s->[7]
1008 |
1009 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1010
786e8c11
YO
1011 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1012
1013This shows that when matching against the string 'hers' we will begin at state 1
1014read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1015then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1016is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1017single traverse. We store a mapping from accepting to state to which word was
1018matched, and then when we have multiple possibilities we try to complete the
1019rest of the regex in the order in which they occured in the alternation.
1020
1021The only prior NFA like behaviour that would be changed by the TRIE support is
1022the silent ignoring of duplicate alternations which are of the form:
1023
1024 / (DUPE|DUPE) X? (?{ ... }) Y /x
1025
1026Thus EVAL blocks follwing a trie may be called a different number of times with
1027and without the optimisation. With the optimisations dupes will be silently
1028ignored. This inconsistant behaviour of EVAL type nodes is well established as
1029the following demonstrates:
1030
1031 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1032
1033which prints out 'word' three times, but
1034
1035 'words'=~/(word|word|word)(?{ print $1 })S/
1036
1037which doesnt print it out at all. This is due to other optimisations kicking in.
1038
1039Example of what happens on a structural level:
1040
1041The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1042
1043 1: CURLYM[1] {1,32767}(18)
1044 5: BRANCH(8)
1045 6: EXACT <ac>(16)
1046 8: BRANCH(11)
1047 9: EXACT <ad>(16)
1048 11: BRANCH(14)
1049 12: EXACT <ab>(16)
1050 16: SUCCEED(0)
1051 17: NOTHING(18)
1052 18: END(0)
1053
1054This would be optimizable with startbranch=5, first=5, last=16, tail=16
1055and should turn into:
1056
1057 1: CURLYM[1] {1,32767}(18)
1058 5: TRIE(16)
1059 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1060 <ac>
1061 <ad>
1062 <ab>
1063 16: SUCCEED(0)
1064 17: NOTHING(18)
1065 18: END(0)
1066
1067Cases where tail != last would be like /(?foo|bar)baz/:
1068
1069 1: BRANCH(4)
1070 2: EXACT <foo>(8)
1071 4: BRANCH(7)
1072 5: EXACT <bar>(8)
1073 7: TAIL(8)
1074 8: EXACT <baz>(10)
1075 10: END(0)
1076
1077which would be optimizable with startbranch=1, first=1, last=7, tail=8
1078and would end up looking like:
1079
1080 1: TRIE(8)
1081 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1082 <foo>
1083 <bar>
1084 7: TAIL(8)
1085 8: EXACT <baz>(10)
1086 10: END(0)
1087
1088 d = uvuni_to_utf8_flags(d, uv, 0);
1089
1090is the recommended Unicode-aware way of saying
1091
1092 *(d++) = uv;
1093*/
1094
1095#define TRIE_STORE_REVCHAR \
1096 STMT_START { \
1097 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
1098 if (UTF) SvUTF8_on(tmp); \
1099 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1100 } STMT_END
1101
1102#define TRIE_READ_CHAR STMT_START { \
1103 wordlen++; \
1104 if ( UTF ) { \
1105 if ( folder ) { \
1106 if ( foldlen > 0 ) { \
1107 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1108 foldlen -= len; \
1109 scan += len; \
1110 len = 0; \
1111 } else { \
1112 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1113 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1114 foldlen -= UNISKIP( uvc ); \
1115 scan = foldbuf + UNISKIP( uvc ); \
1116 } \
1117 } else { \
1118 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1119 } \
1120 } else { \
1121 uvc = (U32)*uc; \
1122 len = 1; \
1123 } \
1124} STMT_END
1125
1126
1127
1128#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1129 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1130 TRIE_LIST_LEN( state ) *= 2; \
1131 Renew( trie->states[ state ].trans.list, \
1132 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1133 } \
1134 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1135 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1136 TRIE_LIST_CUR( state )++; \
1137} STMT_END
07be1b83 1138
786e8c11
YO
1139#define TRIE_LIST_NEW(state) STMT_START { \
1140 Newxz( trie->states[ state ].trans.list, \
1141 4, reg_trie_trans_le ); \
1142 TRIE_LIST_CUR( state ) = 1; \
1143 TRIE_LIST_LEN( state ) = 4; \
1144} STMT_END
07be1b83 1145
786e8c11
YO
1146#define TRIE_HANDLE_WORD(state) STMT_START { \
1147 U16 dupe= trie->states[ state ].wordnum; \
1148 regnode * const noper_next = regnext( noper ); \
1149 \
1150 if (trie->wordlen) \
1151 trie->wordlen[ curword ] = wordlen; \
1152 DEBUG_r({ \
1153 /* store the word for dumping */ \
1154 SV* tmp; \
1155 if (OP(noper) != NOTHING) \
1156 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1157 else \
1158 tmp = newSVpvn( "", 0 ); \
1159 if ( UTF ) SvUTF8_on( tmp ); \
1160 av_push( trie->words, tmp ); \
1161 }); \
1162 \
1163 curword++; \
1164 \
1165 if ( noper_next < tail ) { \
1166 if (!trie->jump) \
1167 Newxz( trie->jump, word_count + 1, U16); \
1168 trie->jump[curword] = (U16)(tail - noper_next); \
1169 if (!jumper) \
1170 jumper = noper_next; \
1171 if (!nextbranch) \
1172 nextbranch= regnext(cur); \
1173 } \
1174 \
1175 if ( dupe ) { \
1176 /* So it's a dupe. This means we need to maintain a */\
1177 /* linked-list from the first to the next. */\
1178 /* we only allocate the nextword buffer when there */\
1179 /* a dupe, so first time we have to do the allocation */\
1180 if (!trie->nextword) \
1181 Newxz( trie->nextword, word_count + 1, U16); \
1182 while ( trie->nextword[dupe] ) \
1183 dupe= trie->nextword[dupe]; \
1184 trie->nextword[dupe]= curword; \
1185 } else { \
1186 /* we haven't inserted this word yet. */ \
1187 trie->states[ state ].wordnum = curword; \
1188 } \
1189} STMT_END
07be1b83 1190
3dab1dad 1191
786e8c11
YO
1192#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1193 ( ( base + charid >= ucharcount \
1194 && base + charid < ubound \
1195 && state == trie->trans[ base - ucharcount + charid ].check \
1196 && trie->trans[ base - ucharcount + charid ].next ) \
1197 ? trie->trans[ base - ucharcount + charid ].next \
1198 : ( state==1 ? special : 0 ) \
1199 )
3dab1dad 1200
786e8c11
YO
1201#define MADE_TRIE 1
1202#define MADE_JUMP_TRIE 2
1203#define MADE_EXACT_TRIE 4
3dab1dad 1204
a3621e74 1205STATIC I32
786e8c11 1206S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1207{
27da23d5 1208 dVAR;
a3621e74
YO
1209 /* first pass, loop through and scan words */
1210 reg_trie_data *trie;
1211 regnode *cur;
9f7f3913 1212 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1213 STRLEN len = 0;
1214 UV uvc = 0;
1215 U16 curword = 0;
1216 U32 next_alloc = 0;
786e8c11
YO
1217 regnode *jumper = NULL;
1218 regnode *nextbranch = NULL;
a3621e74 1219 /* we just use folder as a flag in utf8 */
e1ec3a88 1220 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1221 ? PL_fold
1222 : ( flags == EXACTFL
1223 ? PL_fold_locale
1224 : NULL
1225 )
1226 );
1227
e1ec3a88 1228 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1229 SV *re_trie_maxbuff;
3dab1dad
YO
1230#ifndef DEBUGGING
1231 /* these are only used during construction but are useful during
8e11feef 1232 * debugging so we store them in the struct when debugging.
8e11feef 1233 */
3dab1dad 1234 STRLEN trie_charcount=0;
3dab1dad
YO
1235 AV *trie_revcharmap;
1236#endif
a3621e74 1237 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1238#ifndef DEBUGGING
1239 PERL_UNUSED_ARG(depth);
1240#endif
a3621e74 1241
a02a5408 1242 Newxz( trie, 1, reg_trie_data );
a3621e74 1243 trie->refcount = 1;
3dab1dad 1244 trie->startstate = 1;
786e8c11 1245 trie->wordcount = word_count;
a3621e74 1246 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1247 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1248 if (!(UTF && folder))
1249 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1250 DEBUG_r({
1251 trie->words = newAV();
a3621e74 1252 });
3dab1dad 1253 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1254
0111c4fd 1255 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1256 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1257 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1258 }
3dab1dad
YO
1259 DEBUG_OPTIMISE_r({
1260 PerlIO_printf( Perl_debug_log,
786e8c11 1261 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1262 (int)depth * 2 + 2, "",
1263 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1264 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1265 (int)depth);
3dab1dad 1266 });
a3621e74
YO
1267 /* -- First loop and Setup --
1268
1269 We first traverse the branches and scan each word to determine if it
1270 contains widechars, and how many unique chars there are, this is
1271 important as we have to build a table with at least as many columns as we
1272 have unique chars.
1273
1274 We use an array of integers to represent the character codes 0..255
1275 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1276 native representation of the character value as the key and IV's for the
1277 coded index.
1278
1279 *TODO* If we keep track of how many times each character is used we can
1280 remap the columns so that the table compression later on is more
1281 efficient in terms of memory by ensuring most common value is in the
1282 middle and the least common are on the outside. IMO this would be better
1283 than a most to least common mapping as theres a decent chance the most
1284 common letter will share a node with the least common, meaning the node
1285 will not be compressable. With a middle is most common approach the worst
1286 case is when we have the least common nodes twice.
1287
1288 */
1289
a3621e74 1290 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1291 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1292 const U8 *uc = (U8*)STRING( noper );
a28509cc 1293 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1294 STRLEN foldlen = 0;
1295 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1296 const U8 *scan = (U8*)NULL;
07be1b83 1297 U32 wordlen = 0; /* required init */
3dab1dad 1298 STRLEN chars=0;
a3621e74 1299
3dab1dad
YO
1300 if (OP(noper) == NOTHING) {
1301 trie->minlen= 0;
1302 continue;
1303 }
1304 if (trie->bitmap) {
1305 TRIE_BITMAP_SET(trie,*uc);
1306 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1307 }
a3621e74 1308 for ( ; uc < e ; uc += len ) {
3dab1dad 1309 TRIE_CHARCOUNT(trie)++;
a3621e74 1310 TRIE_READ_CHAR;
3dab1dad 1311 chars++;
a3621e74
YO
1312 if ( uvc < 256 ) {
1313 if ( !trie->charmap[ uvc ] ) {
1314 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1315 if ( folder )
1316 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1317 TRIE_STORE_REVCHAR;
a3621e74
YO
1318 }
1319 } else {
1320 SV** svpp;
1321 if ( !trie->widecharmap )
1322 trie->widecharmap = newHV();
1323
1324 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1325
1326 if ( !svpp )
e4584336 1327 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1328
1329 if ( !SvTRUE( *svpp ) ) {
1330 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1331 TRIE_STORE_REVCHAR;
a3621e74
YO
1332 }
1333 }
1334 }
3dab1dad
YO
1335 if( cur == first ) {
1336 trie->minlen=chars;
1337 trie->maxlen=chars;
1338 } else if (chars < trie->minlen) {
1339 trie->minlen=chars;
1340 } else if (chars > trie->maxlen) {
1341 trie->maxlen=chars;
1342 }
1343
a3621e74
YO
1344 } /* end first pass */
1345 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1346 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1347 (int)depth * 2 + 2,"",
85c3142d 1348 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1349 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1350 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1351 );
786e8c11 1352 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1353
1354 /*
1355 We now know what we are dealing with in terms of unique chars and
1356 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1357 representation using a flat table will take. If it's over a reasonable
1358 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1359 conservative but potentially much slower representation using an array
1360 of lists.
1361
1362 At the end we convert both representations into the same compressed
1363 form that will be used in regexec.c for matching with. The latter
1364 is a form that cannot be used to construct with but has memory
1365 properties similar to the list form and access properties similar
1366 to the table form making it both suitable for fast searches and
1367 small enough that its feasable to store for the duration of a program.
1368
1369 See the comment in the code where the compressed table is produced
1370 inplace from the flat tabe representation for an explanation of how
1371 the compression works.
1372
1373 */
1374
1375
3dab1dad 1376 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1377 /*
1378 Second Pass -- Array Of Lists Representation
1379
1380 Each state will be represented by a list of charid:state records
1381 (reg_trie_trans_le) the first such element holds the CUR and LEN
1382 points of the allocated array. (See defines above).
1383
1384 We build the initial structure using the lists, and then convert
1385 it into the compressed table form which allows faster lookups
1386 (but cant be modified once converted).
a3621e74
YO
1387 */
1388
a3621e74
YO
1389 STRLEN transcount = 1;
1390
3dab1dad 1391 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1392 TRIE_LIST_NEW(1);
1393 next_alloc = 2;
1394
1395 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1396
c445ea15
AL
1397 regnode * const noper = NEXTOPER( cur );
1398 U8 *uc = (U8*)STRING( noper );
1399 const U8 * const e = uc + STR_LEN( noper );
1400 U32 state = 1; /* required init */
1401 U16 charid = 0; /* sanity init */
1402 U8 *scan = (U8*)NULL; /* sanity init */
1403 STRLEN foldlen = 0; /* required init */
07be1b83 1404 U32 wordlen = 0; /* required init */
c445ea15
AL
1405 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1406
3dab1dad 1407 if (OP(noper) != NOTHING) {
786e8c11 1408 for ( ; uc < e ; uc += len ) {
c445ea15 1409
786e8c11 1410 TRIE_READ_CHAR;
c445ea15 1411
786e8c11
YO
1412 if ( uvc < 256 ) {
1413 charid = trie->charmap[ uvc ];
c445ea15 1414 } else {
786e8c11
YO
1415 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1416 if ( !svpp ) {
1417 charid = 0;
1418 } else {
1419 charid=(U16)SvIV( *svpp );
1420 }
c445ea15 1421 }
786e8c11
YO
1422 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1423 if ( charid ) {
a3621e74 1424
786e8c11
YO
1425 U16 check;
1426 U32 newstate = 0;
a3621e74 1427
786e8c11
YO
1428 charid--;
1429 if ( !trie->states[ state ].trans.list ) {
1430 TRIE_LIST_NEW( state );
c445ea15 1431 }
786e8c11
YO
1432 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1433 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1434 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1435 break;
1436 }
1437 }
1438 if ( ! newstate ) {
1439 newstate = next_alloc++;
1440 TRIE_LIST_PUSH( state, charid, newstate );
1441 transcount++;
1442 }
1443 state = newstate;
1444 } else {
1445 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1446 }
a28509cc 1447 }
c445ea15 1448 }
3dab1dad 1449 TRIE_HANDLE_WORD(state);
a3621e74
YO
1450
1451 } /* end second pass */
1452
786e8c11 1453 trie->laststate = next_alloc;
a3621e74
YO
1454 Renew( trie->states, next_alloc, reg_trie_state );
1455
3dab1dad
YO
1456 /* and now dump it out before we compress it */
1457 DEBUG_TRIE_COMPILE_MORE_r(
1458 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1459 );
a3621e74 1460
a02a5408 1461 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1462 {
1463 U32 state;
a3621e74
YO
1464 U32 tp = 0;
1465 U32 zp = 0;
1466
1467
1468 for( state=1 ; state < next_alloc ; state ++ ) {
1469 U32 base=0;
1470
1471 /*
1472 DEBUG_TRIE_COMPILE_MORE_r(
1473 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1474 );
1475 */
1476
1477 if (trie->states[state].trans.list) {
1478 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1479 U16 maxid=minid;
a28509cc 1480 U16 idx;
a3621e74
YO
1481
1482 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1483 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1484 if ( forid < minid ) {
1485 minid=forid;
1486 } else if ( forid > maxid ) {
1487 maxid=forid;
1488 }
a3621e74
YO
1489 }
1490 if ( transcount < tp + maxid - minid + 1) {
1491 transcount *= 2;
1492 Renew( trie->trans, transcount, reg_trie_trans );
1493 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1494 }
1495 base = trie->uniquecharcount + tp - minid;
1496 if ( maxid == minid ) {
1497 U32 set = 0;
1498 for ( ; zp < tp ; zp++ ) {
1499 if ( ! trie->trans[ zp ].next ) {
1500 base = trie->uniquecharcount + zp - minid;
1501 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1502 trie->trans[ zp ].check = state;
1503 set = 1;
1504 break;
1505 }
1506 }
1507 if ( !set ) {
1508 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1509 trie->trans[ tp ].check = state;
1510 tp++;
1511 zp = tp;
1512 }
1513 } else {
1514 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1515 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1516 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1517 trie->trans[ tid ].check = state;
1518 }
1519 tp += ( maxid - minid + 1 );
1520 }
1521 Safefree(trie->states[ state ].trans.list);
1522 }
1523 /*
1524 DEBUG_TRIE_COMPILE_MORE_r(
1525 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1526 );
1527 */
1528 trie->states[ state ].trans.base=base;
1529 }
cc601c31 1530 trie->lasttrans = tp + 1;
a3621e74
YO
1531 }
1532 } else {
1533 /*
1534 Second Pass -- Flat Table Representation.
1535
1536 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1537 We know that we will need Charcount+1 trans at most to store the data
1538 (one row per char at worst case) So we preallocate both structures
1539 assuming worst case.
1540
1541 We then construct the trie using only the .next slots of the entry
1542 structs.
1543
1544 We use the .check field of the first entry of the node temporarily to
1545 make compression both faster and easier by keeping track of how many non
1546 zero fields are in the node.
1547
1548 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1549 transition.
1550
1551 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1552 number representing the first entry of the node, and state as a
1553 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1554 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1555 are 2 entrys per node. eg:
1556
1557 A B A B
1558 1. 2 4 1. 3 7
1559 2. 0 3 3. 0 5
1560 3. 0 0 5. 0 0
1561 4. 0 0 7. 0 0
1562
1563 The table is internally in the right hand, idx form. However as we also
1564 have to deal with the states array which is indexed by nodenum we have to
1565 use TRIE_NODENUM() to convert.
1566
1567 */
1568
3dab1dad
YO
1569
1570 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1571 reg_trie_trans );
3dab1dad 1572 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1573 next_alloc = trie->uniquecharcount + 1;
1574
3dab1dad 1575
a3621e74
YO
1576 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1577
c445ea15 1578 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1579 const U8 *uc = (U8*)STRING( noper );
1580 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1581
1582 U32 state = 1; /* required init */
1583
1584 U16 charid = 0; /* sanity init */
1585 U32 accept_state = 0; /* sanity init */
1586 U8 *scan = (U8*)NULL; /* sanity init */
1587
1588 STRLEN foldlen = 0; /* required init */
07be1b83 1589 U32 wordlen = 0; /* required init */
a3621e74
YO
1590 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1591
3dab1dad 1592 if ( OP(noper) != NOTHING ) {
786e8c11 1593 for ( ; uc < e ; uc += len ) {
a3621e74 1594
786e8c11 1595 TRIE_READ_CHAR;
a3621e74 1596
786e8c11
YO
1597 if ( uvc < 256 ) {
1598 charid = trie->charmap[ uvc ];
1599 } else {
1600 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1601 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1602 }
786e8c11
YO
1603 if ( charid ) {
1604 charid--;
1605 if ( !trie->trans[ state + charid ].next ) {
1606 trie->trans[ state + charid ].next = next_alloc;
1607 trie->trans[ state ].check++;
1608 next_alloc += trie->uniquecharcount;
1609 }
1610 state = trie->trans[ state + charid ].next;
1611 } else {
1612 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1613 }
1614 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1615 }
a3621e74 1616 }
3dab1dad
YO
1617 accept_state = TRIE_NODENUM( state );
1618 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1619
1620 } /* end second pass */
1621
3dab1dad
YO
1622 /* and now dump it out before we compress it */
1623 DEBUG_TRIE_COMPILE_MORE_r(
1624 dump_trie_interim_table(trie,next_alloc,depth+1)
1625 );
a3621e74 1626
a3621e74
YO
1627 {
1628 /*
1629 * Inplace compress the table.*
1630
1631 For sparse data sets the table constructed by the trie algorithm will
1632 be mostly 0/FAIL transitions or to put it another way mostly empty.
1633 (Note that leaf nodes will not contain any transitions.)
1634
1635 This algorithm compresses the tables by eliminating most such
1636 transitions, at the cost of a modest bit of extra work during lookup:
1637
1638 - Each states[] entry contains a .base field which indicates the
1639 index in the state[] array wheres its transition data is stored.
1640
1641 - If .base is 0 there are no valid transitions from that node.
1642
1643 - If .base is nonzero then charid is added to it to find an entry in
1644 the trans array.
1645
1646 -If trans[states[state].base+charid].check!=state then the
1647 transition is taken to be a 0/Fail transition. Thus if there are fail
1648 transitions at the front of the node then the .base offset will point
1649 somewhere inside the previous nodes data (or maybe even into a node
1650 even earlier), but the .check field determines if the transition is
1651 valid.
1652
786e8c11 1653 XXX - wrong maybe?
a3621e74
YO
1654 The following process inplace converts the table to the compressed
1655 table: We first do not compress the root node 1,and mark its all its
1656 .check pointers as 1 and set its .base pointer as 1 as well. This
1657 allows to do a DFA construction from the compressed table later, and
1658 ensures that any .base pointers we calculate later are greater than
1659 0.
1660
1661 - We set 'pos' to indicate the first entry of the second node.
1662
1663 - We then iterate over the columns of the node, finding the first and
1664 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1665 and set the .check pointers accordingly, and advance pos
1666 appropriately and repreat for the next node. Note that when we copy
1667 the next pointers we have to convert them from the original
1668 NODEIDX form to NODENUM form as the former is not valid post
1669 compression.
1670
1671 - If a node has no transitions used we mark its base as 0 and do not
1672 advance the pos pointer.
1673
1674 - If a node only has one transition we use a second pointer into the
1675 structure to fill in allocated fail transitions from other states.
1676 This pointer is independent of the main pointer and scans forward
1677 looking for null transitions that are allocated to a state. When it
1678 finds one it writes the single transition into the "hole". If the
786e8c11 1679 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1680
1681 - Once compressed we can Renew/realloc the structures to release the
1682 excess space.
1683
1684 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1685 specifically Fig 3.47 and the associated pseudocode.
1686
1687 demq
1688 */
a3b680e6 1689 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1690 U32 state, charid;
a3621e74 1691 U32 pos = 0, zp=0;
786e8c11 1692 trie->laststate = laststate;
a3621e74
YO
1693
1694 for ( state = 1 ; state < laststate ; state++ ) {
1695 U8 flag = 0;
a28509cc
AL
1696 const U32 stateidx = TRIE_NODEIDX( state );
1697 const U32 o_used = trie->trans[ stateidx ].check;
1698 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1699 trie->trans[ stateidx ].check = 0;
1700
1701 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1702 if ( flag || trie->trans[ stateidx + charid ].next ) {
1703 if ( trie->trans[ stateidx + charid ].next ) {
1704 if (o_used == 1) {
1705 for ( ; zp < pos ; zp++ ) {
1706 if ( ! trie->trans[ zp ].next ) {
1707 break;
1708 }
1709 }
1710 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1711 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1712 trie->trans[ zp ].check = state;
1713 if ( ++zp > pos ) pos = zp;
1714 break;
1715 }
1716 used--;
1717 }
1718 if ( !flag ) {
1719 flag = 1;
1720 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1721 }
1722 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1723 trie->trans[ pos ].check = state;
1724 pos++;
1725 }
1726 }
1727 }
cc601c31 1728 trie->lasttrans = pos + 1;
a3621e74
YO
1729 Renew( trie->states, laststate + 1, reg_trie_state);
1730 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1731 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1732 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1733 (int)depth * 2 + 2,"",
1734 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1735 (IV)next_alloc,
1736 (IV)pos,
a3621e74
YO
1737 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1738 );
1739
1740 } /* end table compress */
1741 }
cc601c31
YO
1742 /* resize the trans array to remove unused space */
1743 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1744
3dab1dad
YO
1745 /* and now dump out the compressed format */
1746 DEBUG_TRIE_COMPILE_r(
1747 dump_trie(trie,depth+1)
1748 );
07be1b83 1749
3dab1dad
YO
1750 { /* Modify the program and insert the new TRIE node*/
1751 regnode *convert;
1752 U8 nodetype =(U8)(flags & 0xFF);
1753 char *str=NULL;
786e8c11 1754
07be1b83 1755#ifdef DEBUGGING
786e8c11 1756
07be1b83
YO
1757 U32 mjd_offset;
1758 U32 mjd_nodelen;
1759#endif
a3621e74 1760 /*
3dab1dad
YO
1761 This means we convert either the first branch or the first Exact,
1762 depending on whether the thing following (in 'last') is a branch
1763 or not and whther first is the startbranch (ie is it a sub part of
1764 the alternation or is it the whole thing.)
1765 Assuming its a sub part we conver the EXACT otherwise we convert
1766 the whole branch sequence, including the first.
a3621e74 1767 */
3dab1dad
YO
1768 /* Find the node we are going to overwrite */
1769 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1770 /* whole branch chain */
3dab1dad 1771 convert = first;
07be1b83
YO
1772 DEBUG_r({
1773 const regnode *nop = NEXTOPER( convert );
1774 mjd_offset= Node_Offset((nop));
1775 mjd_nodelen= Node_Length((nop));
1776 });
1777 } else {
1778 /* branch sub-chain */
3dab1dad
YO
1779 convert = NEXTOPER( first );
1780 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1781 DEBUG_r({
1782 mjd_offset= Node_Offset((convert));
1783 mjd_nodelen= Node_Length((convert));
1784 });
1785 }
1786 DEBUG_OPTIMISE_r(
1787 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1788 (int)depth * 2 + 2, "",
786e8c11 1789 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1790 );
a3621e74 1791
3dab1dad
YO
1792 /* But first we check to see if there is a common prefix we can
1793 split out as an EXACT and put in front of the TRIE node. */
1794 trie->startstate= 1;
786e8c11 1795 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad
YO
1796 U32 state;
1797 DEBUG_OPTIMISE_r(
8e11feef
RGS
1798 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1799 (int)depth * 2 + 2, "",
786e8c11 1800 (UV)trie->laststate)
8e11feef 1801 );
786e8c11 1802 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
a3621e74 1803 U32 ofs = 0;
8e11feef
RGS
1804 I32 idx = -1;
1805 U32 count = 0;
1806 const U32 base = trie->states[ state ].trans.base;
a3621e74 1807
3dab1dad 1808 if ( trie->states[state].wordnum )
8e11feef 1809 count = 1;
a3621e74 1810
8e11feef 1811 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1812 if ( ( base + ofs >= trie->uniquecharcount ) &&
1813 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1814 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1815 {
3dab1dad 1816 if ( ++count > 1 ) {
8e11feef 1817 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1818 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1819 if ( state == 1 ) break;
3dab1dad
YO
1820 if ( count == 2 ) {
1821 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1822 DEBUG_OPTIMISE_r(
8e11feef
RGS
1823 PerlIO_printf(Perl_debug_log,
1824 "%*sNew Start State=%"UVuf" Class: [",
1825 (int)depth * 2 + 2, "",
786e8c11 1826 (UV)state));
be8e71aa
YO
1827 if (idx >= 0) {
1828 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1829 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1830
3dab1dad 1831 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1832 if ( folder )
1833 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1834 DEBUG_OPTIMISE_r(
07be1b83 1835 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1836 );
8e11feef
RGS
1837 }
1838 }
1839 TRIE_BITMAP_SET(trie,*ch);
1840 if ( folder )
1841 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1842 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1843 }
1844 idx = ofs;
1845 }
3dab1dad
YO
1846 }
1847 if ( count == 1 ) {
1848 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1849 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1850 DEBUG_OPTIMISE_r(
8e11feef
RGS
1851 PerlIO_printf( Perl_debug_log,
1852 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1853 (int)depth * 2 + 2, "",
786e8c11 1854 (UV)state, (UV)idx, ch)
3dab1dad
YO
1855 );
1856 if ( state==1 ) {
1857 OP( convert ) = nodetype;
1858 str=STRING(convert);
1859 STR_LEN(convert)=0;
1860 }
1861 *str++=*ch;
1862 STR_LEN(convert)++;
a3621e74 1863
8e11feef 1864 } else {
f9049ba1 1865#ifdef DEBUGGING
8e11feef
RGS
1866 if (state>1)
1867 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1868#endif
8e11feef
RGS
1869 break;
1870 }
1871 }
3dab1dad 1872 if (str) {
8e11feef 1873 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1874 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1875 trie->startstate = state;
07be1b83
YO
1876 trie->minlen -= (state - 1);
1877 trie->maxlen -= (state - 1);
1878 DEBUG_r({
1879 regnode *fix = convert;
1880 mjd_nodelen++;
1881 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1882 while( ++fix < n ) {
1883 Set_Node_Offset_Length(fix, 0, 0);
1884 }
1885 });
8e11feef
RGS
1886 if (trie->maxlen) {
1887 convert = n;
1888 } else {
3dab1dad
YO
1889 NEXT_OFF(convert) = (U16)(tail - convert);
1890 }
1891 }
1892 }
1893 if ( trie->maxlen ) {
8e11feef
RGS
1894 NEXT_OFF( convert ) = (U16)(tail - convert);
1895 ARG_SET( convert, data_slot );
786e8c11
YO
1896 /* Store the offset to the first unabsorbed branch in
1897 jump[0], which is otherwise unused by the jump logic.
1898 We use this when dumping a trie and during optimisation. */
1899 if (trie->jump)
1900 trie->jump[0] = (U16)(tail - nextbranch);
1901 if (!jumper)
85c3142d 1902 jumper = last;
786e8c11
YO
1903 /* XXXX */
1904 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1905 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1906 {
1907 OP( convert ) = TRIEC;
1908 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1909 Safefree(trie->bitmap);
1910 trie->bitmap= NULL;
1911 } else
1912 OP( convert ) = TRIE;
a3621e74 1913
3dab1dad
YO
1914 /* store the type in the flags */
1915 convert->flags = nodetype;
1916 /* XXX We really should free up the resource in trie now, as we wont use them */
1917 }
a3621e74
YO
1918 /* needed for dumping*/
1919 DEBUG_r({
786e8c11
YO
1920 regnode *optimize = convert
1921 + NODE_STEP_REGNODE
1922 + regarglen[ OP( convert ) ];
07be1b83
YO
1923 regnode *opt = convert;
1924 while (++opt<optimize) {
1925 Set_Node_Offset_Length(opt,0,0);
1926 }
786e8c11
YO
1927 /*
1928 Try to clean up some of the debris left after the
1929 optimisation.
a3621e74 1930 */
786e8c11 1931 while( optimize < jumper ) {
07be1b83 1932 mjd_nodelen += Node_Length((optimize));
a3621e74 1933 OP( optimize ) = OPTIMIZED;
07be1b83 1934 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1935 optimize++;
1936 }
07be1b83 1937 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1938 });
1939 } /* end node insert */
07be1b83 1940#ifndef DEBUGGING
6e8b4190 1941 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1942#endif
786e8c11
YO
1943 return trie->jump
1944 ? MADE_JUMP_TRIE
1945 : trie->startstate>1
1946 ? MADE_EXACT_TRIE
1947 : MADE_TRIE;
1948}
1949
1950STATIC void
1951S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1952{
1953/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1954
1955 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1956 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1957 ISBN 0-201-10088-6
1958
1959 We find the fail state for each state in the trie, this state is the longest proper
1960 suffix of the current states 'word' that is also a proper prefix of another word in our
1961 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1962 the DFA not to have to restart after its tried and failed a word at a given point, it
1963 simply continues as though it had been matching the other word in the first place.
1964 Consider
1965 'abcdgu'=~/abcdefg|cdgu/
1966 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1967 fail, which would bring use to the state representing 'd' in the second word where we would
1968 try 'g' and succeed, prodceding to match 'cdgu'.
1969 */
1970 /* add a fail transition */
1971 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1972 U32 *q;
1973 const U32 ucharcount = trie->uniquecharcount;
1974 const U32 numstates = trie->laststate;
1975 const U32 ubound = trie->lasttrans + ucharcount;
1976 U32 q_read = 0;
1977 U32 q_write = 0;
1978 U32 charid;
1979 U32 base = trie->states[ 1 ].trans.base;
1980 U32 *fail;
1981 reg_ac_data *aho;
1982 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1983 GET_RE_DEBUG_FLAGS_DECL;
1984#ifndef DEBUGGING
1985 PERL_UNUSED_ARG(depth);
1986#endif
1987
1988
1989 ARG_SET( stclass, data_slot );
1990 Newxz( aho, 1, reg_ac_data );
1991 RExC_rx->data->data[ data_slot ] = (void*)aho;
1992 aho->trie=trie;
1993 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1994 (trie->laststate+1)*sizeof(reg_trie_state));
1995 Newxz( q, numstates, U32);
1996 Newxz( aho->fail, numstates, U32 );
1997 aho->refcount = 1;
1998 fail = aho->fail;
1999 /* initialize fail[0..1] to be 1 so that we always have
2000 a valid final fail state */
2001 fail[ 0 ] = fail[ 1 ] = 1;
2002
2003 for ( charid = 0; charid < ucharcount ; charid++ ) {
2004 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2005 if ( newstate ) {
2006 q[ q_write ] = newstate;
2007 /* set to point at the root */
2008 fail[ q[ q_write++ ] ]=1;
2009 }
2010 }
2011 while ( q_read < q_write) {
2012 const U32 cur = q[ q_read++ % numstates ];
2013 base = trie->states[ cur ].trans.base;
2014
2015 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2016 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2017 if (ch_state) {
2018 U32 fail_state = cur;
2019 U32 fail_base;
2020 do {
2021 fail_state = fail[ fail_state ];
2022 fail_base = aho->states[ fail_state ].trans.base;
2023 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2024
2025 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2026 fail[ ch_state ] = fail_state;
2027 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2028 {
2029 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2030 }
2031 q[ q_write++ % numstates] = ch_state;
2032 }
2033 }
2034 }
2035 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2036 when we fail in state 1, this allows us to use the
2037 charclass scan to find a valid start char. This is based on the principle
2038 that theres a good chance the string being searched contains lots of stuff
2039 that cant be a start char.
2040 */
2041 fail[ 0 ] = fail[ 1 ] = 0;
2042 DEBUG_TRIE_COMPILE_r({
2043 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
2044 for( q_read=1; q_read<numstates; q_read++ ) {
2045 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2046 }
2047 PerlIO_printf(Perl_debug_log, "\n");
2048 });
2049 Safefree(q);
2050 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2051}
2052
786e8c11 2053
a3621e74 2054/*
5d1c421c
JH
2055 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2056 * These need to be revisited when a newer toolchain becomes available.
2057 */
2058#if defined(__sparc64__) && defined(__GNUC__)
2059# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2060# undef SPARC64_GCC_WORKAROUND
2061# define SPARC64_GCC_WORKAROUND 1
2062# endif
2063#endif
2064
07be1b83
YO
2065#define DEBUG_PEEP(str,scan,depth) \
2066 DEBUG_OPTIMISE_r({ \
2067 SV * const mysv=sv_newmortal(); \
2068 regnode *Next = regnext(scan); \
2069 regprop(RExC_rx, mysv, scan); \
2070 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
2071 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2072 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2073 });
2074
1de06328
YO
2075
2076
2077
2078
07be1b83
YO
2079#define JOIN_EXACT(scan,min,flags) \
2080 if (PL_regkind[OP(scan)] == EXACT) \
2081 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2082
be8e71aa 2083STATIC U32
07be1b83
YO
2084S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2085 /* Merge several consecutive EXACTish nodes into one. */
2086 regnode *n = regnext(scan);
2087 U32 stringok = 1;
2088 regnode *next = scan + NODE_SZ_STR(scan);
2089 U32 merged = 0;
2090 U32 stopnow = 0;
2091#ifdef DEBUGGING
2092 regnode *stop = scan;
72f13be8 2093 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2094#else
d47053eb
RGS
2095 PERL_UNUSED_ARG(depth);
2096#endif
2097#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2098 PERL_UNUSED_ARG(flags);
2099 PERL_UNUSED_ARG(val);
07be1b83 2100#endif
07be1b83
YO
2101 DEBUG_PEEP("join",scan,depth);
2102
2103 /* Skip NOTHING, merge EXACT*. */
2104 while (n &&
2105 ( PL_regkind[OP(n)] == NOTHING ||
2106 (stringok && (OP(n) == OP(scan))))
2107 && NEXT_OFF(n)
2108 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2109
2110 if (OP(n) == TAIL || n > next)
2111 stringok = 0;
2112 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2113 DEBUG_PEEP("skip:",n,depth);
2114 NEXT_OFF(scan) += NEXT_OFF(n);
2115 next = n + NODE_STEP_REGNODE;
2116#ifdef DEBUGGING
2117 if (stringok)
2118 stop = n;
2119#endif
2120 n = regnext(n);
2121 }
2122 else if (stringok) {
786e8c11 2123 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2124 regnode * const nnext = regnext(n);
2125
2126 DEBUG_PEEP("merg",n,depth);
2127
2128 merged++;
2129 if (oldl + STR_LEN(n) > U8_MAX)
2130 break;
2131 NEXT_OFF(scan) += NEXT_OFF(n);
2132 STR_LEN(scan) += STR_LEN(n);
2133 next = n + NODE_SZ_STR(n);
2134 /* Now we can overwrite *n : */
2135 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2136#ifdef DEBUGGING
2137 stop = next - 1;
2138#endif
2139 n = nnext;
2140 if (stopnow) break;
2141 }
2142
d47053eb
RGS
2143#ifdef EXPERIMENTAL_INPLACESCAN
2144 if (flags && !NEXT_OFF(n)) {
2145 DEBUG_PEEP("atch", val, depth);
2146 if (reg_off_by_arg[OP(n)]) {
2147 ARG_SET(n, val - n);
2148 }
2149 else {
2150 NEXT_OFF(n) = val - n;
2151 }
2152 stopnow = 1;
2153 }
07be1b83
YO
2154#endif
2155 }
2156
2157 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2158 /*
2159 Two problematic code points in Unicode casefolding of EXACT nodes:
2160
2161 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2162 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2163
2164 which casefold to
2165
2166 Unicode UTF-8
2167
2168 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2169 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2170
2171 This means that in case-insensitive matching (or "loose matching",
2172 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2173 length of the above casefolded versions) can match a target string
2174 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2175 This would rather mess up the minimum length computation.
2176
2177 What we'll do is to look for the tail four bytes, and then peek
2178 at the preceding two bytes to see whether we need to decrease
2179 the minimum length by four (six minus two).
2180
2181 Thanks to the design of UTF-8, there cannot be false matches:
2182 A sequence of valid UTF-8 bytes cannot be a subsequence of
2183 another valid sequence of UTF-8 bytes.
2184
2185 */
2186 char * const s0 = STRING(scan), *s, *t;
2187 char * const s1 = s0 + STR_LEN(scan) - 1;
2188 char * const s2 = s1 - 4;
e294cc5d
JH
2189#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2190 const char t0[] = "\xaf\x49\xaf\x42";
2191#else
07be1b83 2192 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2193#endif
07be1b83
YO
2194 const char * const t1 = t0 + 3;
2195
2196 for (s = s0 + 2;
2197 s < s2 && (t = ninstr(s, s1, t0, t1));
2198 s = t + 4) {
e294cc5d
JH
2199#ifdef EBCDIC
2200 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2201 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2202#else
07be1b83
YO
2203 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2204 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2205#endif
07be1b83
YO
2206 *min -= 4;
2207 }
2208 }
2209
2210#ifdef DEBUGGING
2211 /* Allow dumping */
2212 n = scan + NODE_SZ_STR(scan);
2213 while (n <= stop) {
2214 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2215 OP(n) = OPTIMIZED;
2216 NEXT_OFF(n) = 0;
2217 }
2218 n++;
2219 }
2220#endif
2221 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2222 return stopnow;
2223}
2224
653099ff
GS
2225/* REx optimizer. Converts nodes into quickier variants "in place".
2226 Finds fixed substrings. */
2227
a0288114 2228/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2229 to the position after last scanned or to NULL. */
2230
07be1b83
YO
2231
2232
76e3520e 2233STATIC I32
1de06328
YO
2234S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2235 I32 *minlenp, I32 *deltap,
9a957fbc 2236 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2237 /* scanp: Start here (read-write). */
2238 /* deltap: Write maxlen-minlen here. */
2239 /* last: Stop before this one. */
2240{
97aff369 2241 dVAR;
c277df42
IZ
2242 I32 min = 0, pars = 0, code;
2243 regnode *scan = *scanp, *next;
2244 I32 delta = 0;
2245 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2246 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2247 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2248 scan_data_t data_fake;
653099ff 2249 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2250 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2251 regnode *first_non_open = scan;
2252
a3621e74
YO
2253
2254 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2255#ifdef DEBUGGING
2256 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2257#endif
786e8c11
YO
2258 if ( depth == 0 ) {
2259 while (first_non_open && OP(first_non_open) == OPEN)
2260 first_non_open=regnext(first_non_open);
2261 }
2262
b81d288d 2263
c277df42
IZ
2264 while (scan && OP(scan) != END && scan < last) {
2265 /* Peephole optimizer: */
1de06328 2266 DEBUG_STUDYDATA(data,depth);
07be1b83 2267 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2268 JOIN_EXACT(scan,&min,0);
a3621e74 2269
653099ff
GS
2270 /* Follow the next-chain of the current node and optimize
2271 away all the NOTHINGs from it. */
c277df42 2272 if (OP(scan) != CURLYX) {
a3b680e6 2273 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2274 ? I32_MAX
2275 /* I32 may be smaller than U16 on CRAYs! */
2276 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2277 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2278 int noff;
2279 regnode *n = scan;
b81d288d 2280
c277df42
IZ
2281 /* Skip NOTHING and LONGJMP. */
2282 while ((n = regnext(n))
3dab1dad 2283 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2284 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2285 && off + noff < max)
2286 off += noff;
2287 if (reg_off_by_arg[OP(scan)])
2288 ARG(scan) = off;
b81d288d 2289 else
c277df42
IZ
2290 NEXT_OFF(scan) = off;
2291 }
a3621e74 2292
07be1b83 2293
3dab1dad 2294
653099ff
GS
2295 /* The principal pseudo-switch. Cannot be a switch, since we
2296 look into several different things. */
b81d288d 2297 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2298 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2299 next = regnext(scan);
2300 code = OP(scan);
a3621e74 2301 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2302
2303 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2304 /* NOTE - There is similar code to this block below for handling
2305 TRIE nodes on a re-study. If you change stuff here check there
2306 too. */
c277df42 2307 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2308 struct regnode_charclass_class accum;
d4c19fe8 2309 regnode * const startbranch=scan;
c277df42 2310
653099ff 2311 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2312 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2313 if (flags & SCF_DO_STCLASS)
830247a4 2314 cl_init_zero(pRExC_state, &accum);
a3621e74 2315
c277df42 2316 while (OP(scan) == code) {
830247a4 2317 I32 deltanext, minnext, f = 0, fake;
653099ff 2318 struct regnode_charclass_class this_class;
c277df42
IZ
2319
2320 num++;
2321 data_fake.flags = 0;
b81d288d 2322 if (data) {
2c2d71f5 2323 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2324 data_fake.last_closep = data->last_closep;
2325 }
2326 else
2327 data_fake.last_closep = &fake;
c277df42
IZ
2328 next = regnext(scan);
2329 scan = NEXTOPER(scan);
2330 if (code != BRANCH)
2331 scan = NEXTOPER(scan);
653099ff 2332 if (flags & SCF_DO_STCLASS) {
830247a4 2333 cl_init(pRExC_state, &this_class);
653099ff
GS
2334 data_fake.start_class = &this_class;
2335 f = SCF_DO_STCLASS_AND;
b81d288d 2336 }
e1901655
IZ
2337 if (flags & SCF_WHILEM_VISITED_POS)
2338 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2339
653099ff 2340 /* we suppose the run is continuous, last=next...*/
1de06328 2341 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
a3621e74 2342 next, &data_fake, f,depth+1);
b81d288d 2343 if (min1 > minnext)
c277df42
IZ
2344 min1 = minnext;
2345 if (max1 < minnext + deltanext)
2346 max1 = minnext + deltanext;
2347 if (deltanext == I32_MAX)
aca2d497 2348 is_inf = is_inf_internal = 1;
c277df42
IZ
2349 scan = next;
2350 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2351 pars++;
3dab1dad
YO
2352 if (data) {
2353 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2354 data->flags |= SF_HAS_EVAL;
2c2d71f5 2355 data->whilem_c = data_fake.whilem_c;
3dab1dad 2356 }
653099ff 2357 if (flags & SCF_DO_STCLASS)
830247a4 2358 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2359 if (code == SUSPEND)
c277df42
IZ
2360 break;
2361 }
2362 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2363 min1 = 0;
2364 if (flags & SCF_DO_SUBSTR) {
2365 data->pos_min += min1;
2366 data->pos_delta += max1 - min1;
2367 if (max1 != min1 || is_inf)
2368 data->longest = &(data->longest_float);
2369 }
2370 min += min1;
2371 delta += max1 - min1;
653099ff 2372 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2373 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2374 if (min1) {
2375 cl_and(data->start_class, &and_with);
2376 flags &= ~SCF_DO_STCLASS;
2377 }
2378 }
2379 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2380 if (min1) {
2381 cl_and(data->start_class, &accum);
653099ff 2382 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2383 }
2384 else {
b81d288d 2385 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2386 * data->start_class */
2387 StructCopy(data->start_class, &and_with,
2388 struct regnode_charclass_class);
2389 flags &= ~SCF_DO_STCLASS_AND;
2390 StructCopy(&accum, data->start_class,
2391 struct regnode_charclass_class);
2392 flags |= SCF_DO_STCLASS_OR;
2393 data->start_class->flags |= ANYOF_EOS;
2394 }
653099ff 2395 }
a3621e74 2396
786e8c11 2397 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2398 /* demq.
2399
2400 Assuming this was/is a branch we are dealing with: 'scan' now
2401 points at the item that follows the branch sequence, whatever
2402 it is. We now start at the beginning of the sequence and look
2403 for subsequences of
2404
786e8c11
YO
2405 BRANCH->EXACT=>x1
2406 BRANCH->EXACT=>x2
2407 tail
a3621e74
YO
2408
2409 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2410
2411 If we can find such a subseqence we need to turn the first
2412 element into a trie and then add the subsequent branch exact
2413 strings to the trie.
2414
2415 We have two cases
2416
786e8c11 2417 1. patterns where the whole set of branch can be converted.
a3621e74 2418
786e8c11 2419 2. patterns where only a subset can be converted.
a3621e74
YO
2420
2421 In case 1 we can replace the whole set with a single regop
2422 for the trie. In case 2 we need to keep the start and end
2423 branchs so
2424
2425 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2426 becomes BRANCH TRIE; BRANCH X;
2427
786e8c11
YO
2428 There is an additional case, that being where there is a
2429 common prefix, which gets split out into an EXACT like node
2430 preceding the TRIE node.
2431
2432 If x(1..n)==tail then we can do a simple trie, if not we make
2433 a "jump" trie, such that when we match the appropriate word
2434 we "jump" to the appopriate tail node. Essentailly we turn
2435 a nested if into a case structure of sorts.
a3621e74
YO
2436
2437 */
786e8c11 2438
3dab1dad 2439 int made=0;
0111c4fd
RGS
2440 if (!re_trie_maxbuff) {
2441 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2442 if (!SvIOK(re_trie_maxbuff))
2443 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2444 }
786e8c11 2445 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2446 regnode *cur;
2447 regnode *first = (regnode *)NULL;
2448 regnode *last = (regnode *)NULL;
2449 regnode *tail = scan;
2450 U8 optype = 0;
2451 U32 count=0;
2452
2453#ifdef DEBUGGING
c445ea15 2454 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2455#endif
2456 /* var tail is used because there may be a TAIL
2457 regop in the way. Ie, the exacts will point to the
2458 thing following the TAIL, but the last branch will
2459 point at the TAIL. So we advance tail. If we
2460 have nested (?:) we may have to move through several
2461 tails.
2462 */
2463
2464 while ( OP( tail ) == TAIL ) {
2465 /* this is the TAIL generated by (?:) */
2466 tail = regnext( tail );
2467 }
2468
3dab1dad 2469
a3621e74 2470 DEBUG_OPTIMISE_r({
32fc9b6a 2471 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2472 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2473 (int)depth * 2 + 2, "",
2474 "Looking for TRIE'able sequences. Tail node is: ",
2475 SvPV_nolen_const( mysv )
a3621e74
YO
2476 );
2477 });
3dab1dad 2478
a3621e74
YO
2479 /*
2480
2481 step through the branches, cur represents each
2482 branch, noper is the first thing to be matched
2483 as part of that branch and noper_next is the
2484 regnext() of that node. if noper is an EXACT
2485 and noper_next is the same as scan (our current
2486 position in the regex) then the EXACT branch is
2487 a possible optimization target. Once we have
2488 two or more consequetive such branches we can
2489 create a trie of the EXACT's contents and stich
2490 it in place. If the sequence represents all of
2491 the branches we eliminate the whole thing and
2492 replace it with a single TRIE. If it is a
2493 subsequence then we need to stitch it in. This
2494 means the first branch has to remain, and needs
2495 to be repointed at the item on the branch chain
2496 following the last branch optimized. This could
2497 be either a BRANCH, in which case the
2498 subsequence is internal, or it could be the
2499 item following the branch sequence in which
2500 case the subsequence is at the end.
2501
2502 */
2503
2504 /* dont use tail as the end marker for this traverse */
2505 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2506 regnode * const noper = NEXTOPER( cur );
be981c67 2507#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2508 regnode * const noper_next = regnext( noper );
be981c67 2509#endif
a3621e74 2510
a3621e74 2511 DEBUG_OPTIMISE_r({
32fc9b6a 2512 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2513 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2514 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2515
32fc9b6a 2516 regprop(RExC_rx, mysv, noper);
a3621e74 2517 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2518 SvPV_nolen_const(mysv));
a3621e74
YO
2519
2520 if ( noper_next ) {
32fc9b6a 2521 regprop(RExC_rx, mysv, noper_next );
a3621e74 2522 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2523 SvPV_nolen_const(mysv));
a3621e74 2524 }
3dab1dad
YO
2525 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2526 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2527 });
3dab1dad
YO
2528 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2529 : PL_regkind[ OP( noper ) ] == EXACT )
2530 || OP(noper) == NOTHING )
786e8c11
YO
2531#ifdef NOJUMPTRIE
2532 && noper_next == tail
2533#endif
2534 && count < U16_MAX)
a3621e74
YO
2535 {
2536 count++;
3dab1dad
YO
2537 if ( !first || optype == NOTHING ) {
2538 if (!first) first = cur;
a3621e74
YO
2539 optype = OP( noper );
2540 } else {
a3621e74 2541 last = cur;
a3621e74
YO
2542 }
2543 } else {
2544 if ( last ) {
786e8c11
YO
2545 make_trie( pRExC_state,
2546 startbranch, first, cur, tail, count,
2547 optype, depth+1 );
a3621e74 2548 }
3dab1dad 2549 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2550#ifdef NOJUMPTRIE
2551 && noper_next == tail
2552#endif
2553 ){
a3621e74
YO
2554 count = 1;
2555 first = cur;
2556 optype = OP( noper );
2557 } else {
2558 count = 0;
2559 first = NULL;
2560 optype = 0;
2561 }
2562 last = NULL;
2563 }
2564 }
2565 DEBUG_OPTIMISE_r({
32fc9b6a 2566 regprop(RExC_rx, mysv, cur);
a3621e74 2567 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2568 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2569 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2570
2571 });
2572 if ( last ) {
786e8c11 2573 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2574#ifdef TRIE_STUDY_OPT
786e8c11
YO
2575 if ( ((made == MADE_EXACT_TRIE &&
2576 startbranch == first)
2577 || ( first_non_open == first )) &&
2578 depth==0 )
2579 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2580#endif
07be1b83 2581 }
a3621e74 2582 }
3dab1dad
YO
2583
2584 } /* do trie */
786e8c11 2585
a0ed51b3 2586 }
a3621e74 2587 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2588 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2589 } else /* single branch is optimized. */
c277df42
IZ
2590 scan = NEXTOPER(scan);
2591 continue;
a0ed51b3
LW
2592 }
2593 else if (OP(scan) == EXACT) {
cd439c50 2594 I32 l = STR_LEN(scan);
c445ea15 2595 UV uc;
a0ed51b3 2596 if (UTF) {
a3b680e6 2597 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2598 l = utf8_length(s, s + l);
9041c2e3 2599 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2600 } else {
2601 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2602 }
2603 min += l;
c277df42 2604 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2605 /* The code below prefers earlier match for fixed
2606 offset, later match for variable offset. */
2607 if (data->last_end == -1) { /* Update the start info. */
2608 data->last_start_min = data->pos_min;
2609 data->last_start_max = is_inf
b81d288d 2610 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2611 }
cd439c50 2612 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2613 if (UTF)
2614 SvUTF8_on(data->last_found);
0eda9292 2615 {
9a957fbc 2616 SV * const sv = data->last_found;
a28509cc 2617 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2618 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2619 if (mg && mg->mg_len >= 0)
5e43f467
JH
2620 mg->mg_len += utf8_length((U8*)STRING(scan),
2621 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2622 }
c277df42
IZ
2623 data->last_end = data->pos_min + l;
2624 data->pos_min += l; /* As in the first entry. */
2625 data->flags &= ~SF_BEFORE_EOL;
2626 }
653099ff
GS
2627 if (flags & SCF_DO_STCLASS_AND) {
2628 /* Check whether it is compatible with what we know already! */
2629 int compat = 1;
2630
1aa99e6b 2631 if (uc >= 0x100 ||
516a5887 2632 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2633 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2634 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2635 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2636 )
653099ff
GS
2637 compat = 0;
2638 ANYOF_CLASS_ZERO(data->start_class);
2639 ANYOF_BITMAP_ZERO(data->start_class);
2640 if (compat)
1aa99e6b 2641 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2642 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2643 if (uc < 0x100)
2644 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2645 }
2646 else if (flags & SCF_DO_STCLASS_OR) {
2647 /* false positive possible if the class is case-folded */
1aa99e6b 2648 if (uc < 0x100)
9b877dbb
IH
2649 ANYOF_BITMAP_SET(data->start_class, uc);
2650 else
2651 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2652 data->start_class->flags &= ~ANYOF_EOS;
2653 cl_and(data->start_class, &and_with);
2654 }
2655 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2656 }
3dab1dad 2657 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2658 I32 l = STR_LEN(scan);
1aa99e6b 2659 UV uc = *((U8*)STRING(scan));
653099ff
GS
2660
2661 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2662 if (flags & SCF_DO_SUBSTR) {
2663 assert(data);
1de06328 2664 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2665 }
a0ed51b3 2666 if (UTF) {
6136c704 2667 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2668 l = utf8_length(s, s + l);
9041c2e3 2669 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2670 }
2671 min += l;
ecaa9b9c 2672 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2673 data->pos_min += l;
653099ff
GS
2674 if (flags & SCF_DO_STCLASS_AND) {
2675 /* Check whether it is compatible with what we know already! */
2676 int compat = 1;
2677
1aa99e6b 2678 if (uc >= 0x100 ||
516a5887 2679 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2680 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2681 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2682 compat = 0;
2683 ANYOF_CLASS_ZERO(data->start_class);
2684 ANYOF_BITMAP_ZERO(data->start_class);
2685 if (compat) {
1aa99e6b 2686 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2687 data->start_class->flags &= ~ANYOF_EOS;
2688 data->start_class->flags |= ANYOF_FOLD;
2689 if (OP(scan) == EXACTFL)
2690 data->start_class->flags |= ANYOF_LOCALE;
2691 }
2692 }
2693 else if (flags & SCF_DO_STCLASS_OR) {
2694 if (data->start_class->flags & ANYOF_FOLD) {
2695 /* false positive possible if the class is case-folded.
2696 Assume that the locale settings are the same... */
1aa99e6b
IH
2697 if (uc < 0x100)
2698 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2699 data->start_class->flags &= ~ANYOF_EOS;
2700 }
2701 cl_and(data->start_class, &and_with);
2702 }
2703 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2704 }
bfed75c6 2705 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2706 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2707 I32 f = flags, pos_before = 0;
d4c19fe8 2708 regnode * const oscan = scan;
653099ff
GS
2709 struct regnode_charclass_class this_class;
2710 struct regnode_charclass_class *oclass = NULL;
727f22e3 2711 I32 next_is_eval = 0;
653099ff 2712
3dab1dad 2713 switch (PL_regkind[OP(scan)]) {
653099ff 2714 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2715 scan = NEXTOPER(scan);
2716 goto finish;
2717 case PLUS:
653099ff 2718 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2719 next = NEXTOPER(scan);
653099ff 2720 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2721 mincount = 1;
2722 maxcount = REG_INFTY;
c277df42
IZ
2723 next = regnext(scan);
2724 scan = NEXTOPER(scan);
2725 goto do_curly;
2726 }
2727 }
2728 if (flags & SCF_DO_SUBSTR)
2729 data->pos_min++;
2730 min++;
2731 /* Fall through. */
2732 case STAR:
653099ff
GS
2733 if (flags & SCF_DO_STCLASS) {
2734 mincount = 0;
b81d288d 2735 maxcount = REG_INFTY;
653099ff
GS
2736 next = regnext(scan);
2737 scan = NEXTOPER(scan);
2738 goto do_curly;
2739 }
b81d288d 2740 is_inf = is_inf_internal = 1;
c277df42
IZ
2741 scan = regnext(scan);
2742 if (flags & SCF_DO_SUBSTR) {
1de06328 2743 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2744 data->longest = &(data->longest_float);
2745 }
2746 goto optimize_curly_tail;
2747 case CURLY:
b81d288d 2748 mincount = ARG1(scan);
c277df42
IZ
2749 maxcount = ARG2(scan);
2750 next = regnext(scan);
cb434fcc
IZ
2751 if (OP(scan) == CURLYX) {
2752 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2753 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2754 }
c277df42 2755 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2756 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2757 do_curly:
2758 if (flags & SCF_DO_SUBSTR) {
1de06328 2759 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2760 pos_before = data->pos_min;
2761 }
2762 if (data) {
2763 fl = data->flags;
2764 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2765 if (is_inf)
2766 data->flags |= SF_IS_INF;
2767 }
653099ff 2768 if (flags & SCF_DO_STCLASS) {
830247a4 2769 cl_init(pRExC_state, &this_class);
653099ff
GS
2770 oclass = data->start_class;
2771 data->start_class = &this_class;
2772 f |= SCF_DO_STCLASS_AND;
2773 f &= ~SCF_DO_STCLASS_OR;
2774 }
e1901655
IZ
2775 /* These are the cases when once a subexpression
2776 fails at a particular position, it cannot succeed
2777 even after backtracking at the enclosing scope.
b81d288d 2778
e1901655
IZ
2779 XXXX what if minimal match and we are at the
2780 initial run of {n,m}? */
2781 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2782 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2783
c277df42 2784 /* This will finish on WHILEM, setting scan, or on NULL: */
1de06328 2785 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
a3621e74
YO
2786 (mincount == 0
2787 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2788
2789 if (flags & SCF_DO_STCLASS)
2790 data->start_class = oclass;
2791 if (mincount == 0 || minnext == 0) {
2792 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2793 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2794 }
2795 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2796 /* Switch to OR mode: cache the old value of
653099ff
GS
2797 * data->start_class */
2798 StructCopy(data->start_class, &and_with,
2799 struct regnode_charclass_class);
2800 flags &= ~SCF_DO_STCLASS_AND;
2801 StructCopy(&this_class, data->start_class,
2802 struct regnode_charclass_class);
2803 flags |= SCF_DO_STCLASS_OR;
2804 data->start_class->flags |= ANYOF_EOS;
2805 }
2806 } else { /* Non-zero len */
2807 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2808 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2809 cl_and(data->start_class, &and_with);
2810 }
2811 else if (flags & SCF_DO_STCLASS_AND)
2812 cl_and(data->start_class, &this_class);
2813 flags &= ~SCF_DO_STCLASS;
2814 }
c277df42
IZ
2815 if (!scan) /* It was not CURLYX, but CURLY. */
2816 scan = next;
041457d9
DM
2817 if ( /* ? quantifier ok, except for (?{ ... }) */
2818 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2819 && (minnext == 0) && (deltanext == 0)
99799961 2820 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2821 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2822 && ckWARN(WARN_REGEXP))
b45f050a 2823 {
830247a4 2824 vWARN(RExC_parse,
b45f050a
JF
2825 "Quantifier unexpected on zero-length expression");
2826 }
2827
c277df42 2828 min += minnext * mincount;
b81d288d 2829 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2830 && (minnext + deltanext) > 0)
2831 || deltanext == I32_MAX);
aca2d497 2832 is_inf |= is_inf_internal;
c277df42
IZ
2833 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2834
2835 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2836 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2837 && data->flags & SF_IN_PAR
2838 && !(data->flags & SF_HAS_EVAL)
2839 && !deltanext && minnext == 1 ) {
2840 /* Try to optimize to CURLYN. */
2841 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2842 regnode * const nxt1 = nxt;
497b47a8
JH
2843#ifdef DEBUGGING
2844 regnode *nxt2;
2845#endif
c277df42
IZ
2846
2847 /* Skip open. */
2848 nxt = regnext(nxt);
bfed75c6 2849 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2850 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2851 && STR_LEN(nxt) == 1))
c277df42 2852 goto nogo;
497b47a8 2853#ifdef DEBUGGING
c277df42 2854 nxt2 = nxt;
497b47a8 2855#endif
c277df42 2856 nxt = regnext(nxt);
b81d288d 2857 if (OP(nxt) != CLOSE)
c277df42
IZ
2858 goto nogo;
2859 /* Now we know that nxt2 is the only contents: */
eb160463 2860 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2861 OP(oscan) = CURLYN;
2862 OP(nxt1) = NOTHING; /* was OPEN. */
2863#ifdef DEBUGGING
2864 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2865 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2866 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2867 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2868 OP(nxt + 1) = OPTIMIZED; /* was count. */
2869 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2870#endif
c277df42 2871 }
c277df42
IZ
2872 nogo:
2873
2874 /* Try optimization CURLYX => CURLYM. */
b81d288d 2875 if ( OP(oscan) == CURLYX && data
c277df42 2876 && !(data->flags & SF_HAS_PAR)
c277df42 2877 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2878 && !deltanext /* atom is fixed width */
2879 && minnext != 0 /* CURLYM can't handle zero width */
2880 ) {
c277df42
IZ
2881 /* XXXX How to optimize if data == 0? */
2882 /* Optimize to a simpler form. */
2883 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2884 regnode *nxt2;
2885
2886 OP(oscan) = CURLYM;
2887 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2888 && (OP(nxt2) != WHILEM))
c277df42
IZ
2889 nxt = nxt2;
2890 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2891 /* Need to optimize away parenths. */
2892 if (data->flags & SF_IN_PAR) {
2893 /* Set the parenth number. */
2894 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2895
b81d288d 2896 if (OP(nxt) != CLOSE)
b45f050a 2897 FAIL("Panic opt close");
eb160463 2898 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2899 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2900 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2901#ifdef DEBUGGING
2902 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2903 OP(nxt + 1) = OPTIMIZED; /* was count. */
2904 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2905 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2906#endif
c277df42
IZ
2907#if 0
2908 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2909 regnode *nnxt = regnext(nxt1);
b81d288d 2910
c277df42
IZ
2911 if (nnxt == nxt) {
2912 if (reg_off_by_arg[OP(nxt1)])
2913 ARG_SET(nxt1, nxt2 - nxt1);
2914 else if (nxt2 - nxt1 < U16_MAX)
2915 NEXT_OFF(nxt1) = nxt2 - nxt1;
2916 else
2917 OP(nxt) = NOTHING; /* Cannot beautify */
2918 }
2919 nxt1 = nnxt;
2920 }
2921#endif
2922 /* Optimize again: */
1de06328 2923 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
a3621e74 2924 NULL, 0,depth+1);
a0ed51b3
LW
2925 }
2926 else
c277df42 2927 oscan->flags = 0;
c277df42 2928 }
e1901655
IZ
2929 else if ((OP(oscan) == CURLYX)
2930 && (flags & SCF_WHILEM_VISITED_POS)
2931 /* See the comment on a similar expression above.
2932 However, this time it not a subexpression
2933 we care about, but the expression itself. */
2934 && (maxcount == REG_INFTY)
2935 && data && ++data->whilem_c < 16) {
2936 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2937 /* Find WHILEM (as in regexec.c) */
2938 regnode *nxt = oscan + NEXT_OFF(oscan);
2939
2940 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2941 nxt += ARG(nxt);
eb160463
GS
2942 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2943 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2944 }
b81d288d 2945 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2946 pars++;
2947 if (flags & SCF_DO_SUBSTR) {
c445ea15 2948 SV *last_str = NULL;
c277df42
IZ
2949 int counted = mincount != 0;
2950
2951 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2952#if defined(SPARC64_GCC_WORKAROUND)
2953 I32 b = 0;
2954 STRLEN l = 0;
cfd0369c 2955 const char *s = NULL;
5d1c421c
JH
2956 I32 old = 0;
2957
2958 if (pos_before >= data->last_start_min)
2959 b = pos_before;
2960 else
2961 b = data->last_start_min;
2962
2963 l = 0;
cfd0369c 2964 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2965 old = b - data->last_start_min;
2966
2967#else
b81d288d 2968 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2969 ? pos_before : data->last_start_min;
2970 STRLEN l;
d4c19fe8 2971 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2972 I32 old = b - data->last_start_min;
5d1c421c 2973#endif
a0ed51b3
LW
2974
2975 if (UTF)
2976 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2977
a0ed51b3 2978 l -= old;
c277df42 2979 /* Get the added string: */
79cb57f6 2980 last_str = newSVpvn(s + old, l);
0e933229
IH
2981 if (UTF)
2982 SvUTF8_on(last_str);
c277df42
IZ
2983 if (deltanext == 0 && pos_before == b) {
2984 /* What was added is a constant string */
2985 if (mincount > 1) {
2986 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2987 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2988 SvPVX_const(last_str), l, mincount - 1);
b162af07 2989 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2990 /* Add additional parts. */
b81d288d 2991 SvCUR_set(data->last_found,
c277df42
IZ
2992 SvCUR(data->last_found) - l);
2993 sv_catsv(data->last_found, last_str);
0eda9292
JH
2994 {
2995 SV * sv = data->last_found;
2996 MAGIC *mg =
2997 SvUTF8(sv) && SvMAGICAL(sv) ?
2998 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2999 if (mg && mg->mg_len >= 0)
3000 mg->mg_len += CHR_SVLEN(last_str);
3001 }
c277df42
IZ
3002 data->last_end += l * (mincount - 1);
3003 }
2a8d9689
HS
3004 } else {
3005 /* start offset must point into the last copy */
3006 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3007 data->last_start_max += is_inf ? I32_MAX
3008 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3009 }
3010 }
3011 /* It is counted once already... */
3012 data->pos_min += minnext * (mincount - counted);
3013 data->pos_delta += - counted * deltanext +
3014 (minnext + deltanext) * maxcount - minnext * mincount;
3015 if (mincount != maxcount) {
653099ff
GS
3016 /* Cannot extend fixed substrings found inside
3017 the group. */
1de06328 3018 scan_commit(pRExC_state,data,minlenp);
c277df42 3019 if (mincount && last_str) {
d4c19fe8
AL
3020 SV * const sv = data->last_found;
3021 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3022 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3023
3024 if (mg)
3025 mg->mg_len = -1;
3026 sv_setsv(sv, last_str);
c277df42 3027 data->last_end = data->pos_min;
b81d288d 3028 data->last_start_min =
a0ed51b3 3029 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3030 data->last_start_max = is_inf
3031 ? I32_MAX
c277df42 3032 : data->pos_min + data->pos_delta
a0ed51b3 3033 - CHR_SVLEN(last_str);
c277df42
IZ
3034 }
3035 data->longest = &(data->longest_float);
3036 }
aca2d497 3037 SvREFCNT_dec(last_str);
c277df42 3038 }
405ff068 3039 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3040 data->flags |= SF_HAS_EVAL;
3041 optimize_curly_tail:
c277df42 3042 if (OP(oscan) != CURLYX) {
3dab1dad 3043 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3044 && NEXT_OFF(next))
3045 NEXT_OFF(oscan) += NEXT_OFF(next);
3046 }
c277df42 3047 continue;
653099ff 3048 default: /* REF and CLUMP only? */
c277df42 3049 if (flags & SCF_DO_SUBSTR) {
1de06328 3050 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3051 data->longest = &(data->longest_float);
3052 }
aca2d497 3053 is_inf = is_inf_internal = 1;
653099ff 3054 if (flags & SCF_DO_STCLASS_OR)
830247a4 3055 cl_anything(pRExC_state, data->start_class);
653099ff 3056 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3057 break;
3058 }
a0ed51b3 3059 }
bfed75c6 3060 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3061 int value = 0;
653099ff 3062
c277df42 3063 if (flags & SCF_DO_SUBSTR) {
1de06328 3064 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3065 data->pos_min++;
3066 }
3067 min++;
653099ff
GS
3068 if (flags & SCF_DO_STCLASS) {
3069 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3070
3071 /* Some of the logic below assumes that switching
3072 locale on will only add false positives. */
3dab1dad 3073 switch (PL_regkind[OP(scan)]) {
653099ff 3074 case SANY:
653099ff
GS
3075 default:
3076 do_default:
3077 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3078 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3079 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3080 break;
3081 case REG_ANY:
3082 if (OP(scan) == SANY)
3083 goto do_default;
3084 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3085 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3086 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3087 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3088 }
3089 if (flags & SCF_DO_STCLASS_AND || !value)
3090 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3091 break;
3092 case ANYOF:
3093 if (flags & SCF_DO_STCLASS_AND)
3094 cl_and(data->start_class,
3095 (struct regnode_charclass_class*)scan);
3096 else
830247a4 3097 cl_or(pRExC_state, data->start_class,
653099ff
GS
3098 (struct regnode_charclass_class*)scan);
3099 break;
3100 case ALNUM:
3101 if (flags & SCF_DO_STCLASS_AND) {
3102 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3103 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3104 for (value = 0; value < 256; value++)
3105 if (!isALNUM(value))
3106 ANYOF_BITMAP_CLEAR(data->start_class, value);
3107 }
3108 }
3109 else {
3110 if (data->start_class->flags & ANYOF_LOCALE)
3111 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3112 else {
3113 for (value = 0; value < 256; value++)
3114 if (isALNUM(value))
b81d288d 3115 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3116 }
3117 }
3118 break;
3119 case ALNUML:
3120 if (flags & SCF_DO_STCLASS_AND) {
3121 if (data->start_class->flags & ANYOF_LOCALE)
3122 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3123 }
3124 else {
3125 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3126 data->start_class->flags |= ANYOF_LOCALE;
3127 }
3128 break;
3129 case NALNUM:
3130 if (flags & SCF_DO_STCLASS_AND) {
3131 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3132 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3133 for (value = 0; value < 256; value++)
3134 if (isALNUM(value))
3135 ANYOF_BITMAP_CLEAR(data->start_class, value);
3136 }
3137 }
3138 else {
3139 if (data->start_class->flags & ANYOF_LOCALE)
3140 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3141 else {
3142 for (value = 0; value < 256; value++)
3143 if (!isALNUM(value))
b81d288d 3144 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3145 }
3146 }
3147 break;
3148 case NALNUML:
3149 if (flags & SCF_DO_STCLASS_AND) {
3150 if (data->start_class->flags & ANYOF_LOCALE)
3151 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3152 }
3153 else {
3154 data->start_class->flags |= ANYOF_LOCALE;
3155 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3156 }
3157 break;
3158 case SPACE:
3159 if (flags & SCF_DO_STCLASS_AND) {
3160 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3161 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3162 for (value = 0; value < 256; value++)
3163 if (!isSPACE(value))
3164 ANYOF_BITMAP_CLEAR(data->start_class, value);
3165 }
3166 }
3167 else {
3168 if (data->start_class->flags & ANYOF_LOCALE)
3169 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3170 else {
3171 for (value = 0; value < 256; value++)
3172 if (isSPACE(value))
b81d288d 3173 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3174 }
3175 }
3176 break;
3177 case SPACEL:
3178 if (flags & SCF_DO_STCLASS_AND) {
3179 if (data->start_class->flags & ANYOF_LOCALE)
3180 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3181 }
3182 else {
3183 data->start_class->flags |= ANYOF_LOCALE;
3184 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3185 }
3186 break;
3187 case NSPACE:
3188 if (flags & SCF_DO_STCLASS_AND) {
3189 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3190 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3191 for (value = 0; value < 256; value++)
3192 if (isSPACE(value))
3193 ANYOF_BITMAP_CLEAR(data->start_class, value);
3194 }
3195 }
3196 else {
3197 if (data->start_class->flags & ANYOF_LOCALE)
3198 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3199 else {
3200 for (value = 0; value < 256; value++)
3201 if (!isSPACE(value))
b81d288d 3202 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3203 }
3204 }
3205 break;
3206 case NSPACEL:
3207 if (flags & SCF_DO_STCLASS_AND) {
3208 if (data->start_class->flags & ANYOF_LOCALE) {
3209 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3210 for (value = 0; value < 256; value++)
3211 if (!isSPACE(value))
3212 ANYOF_BITMAP_CLEAR(data->start_class, value);
3213 }
3214 }
3215 else {
3216 data->start_class->flags |= ANYOF_LOCALE;
3217 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3218 }
3219 break;
3220 case DIGIT:
3221 if (flags & SCF_DO_STCLASS_AND) {
3222 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3223 for (value = 0; value < 256; value++)
3224 if (!isDIGIT(value))
3225 ANYOF_BITMAP_CLEAR(data->start_class, value);
3226 }
3227 else {
3228 if (data->start_class->flags & ANYOF_LOCALE)
3229 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3230 else {
3231 for (value = 0; value < 256; value++)
3232 if (isDIGIT(value))
b81d288d 3233 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3234 }
3235 }
3236 break;
3237 case NDIGIT:
3238 if (flags & SCF_DO_STCLASS_AND) {
3239 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3240 for (value = 0; value < 256; value++)
3241 if (isDIGIT(value))
3242 ANYOF_BITMAP_CLEAR(data->start_class, value);
3243 }
3244 else {
3245 if (data->start_class->flags & ANYOF_LOCALE)
3246 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3247 else {
3248 for (value = 0; value < 256; value++)
3249 if (!isDIGIT(value))
b81d288d 3250 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3251 }
3252 }
3253 break;
3254 }
3255 if (flags & SCF_DO_STCLASS_OR)
3256 cl_and(data->start_class, &and_with);
3257 flags &= ~SCF_DO_STCLASS;
3258 }
a0ed51b3 3259 }
3dab1dad 3260 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3261 data->flags |= (OP(scan) == MEOL
3262 ? SF_BEFORE_MEOL
3263 : SF_BEFORE_SEOL);
a0ed51b3 3264 }
3dab1dad 3265 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3266 /* Lookbehind, or need to calculate parens/evals/stclass: */
3267 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3268 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3269 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3270 || OP(scan) == UNLESSM )
3271 {
3272 /* Negative Lookahead/lookbehind
3273 In this case we can't do fixed string optimisation.
3274 */
3275
3276 I32 deltanext, minnext, fake = 0;
3277 regnode *nscan;
3278 struct regnode_charclass_class intrnl;
3279 int f = 0;
3280
3281 data_fake.flags = 0;
3282 if (data) {
3283 data_fake.whilem_c = data->whilem_c;
3284 data_fake.last_closep = data->last_closep;
a0ed51b3 3285 }
1de06328
YO
3286 else
3287 data_fake.last_closep = &fake;
3288 if ( flags & SCF_DO_STCLASS && !scan->flags
3289 && OP(scan) == IFMATCH ) { /* Lookahead */
3290 cl_init(pRExC_state, &intrnl);
3291 data_fake.start_class = &intrnl;
3292 f |= SCF_DO_STCLASS_AND;
c277df42 3293 }
1de06328
YO
3294 if (flags & SCF_WHILEM_VISITED_POS)
3295 f |= SCF_WHILEM_VISITED_POS;
3296 next = regnext(scan);
3297 nscan = NEXTOPER(NEXTOPER(scan));
3298 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3299 if (scan->flags) {
3300 if (deltanext) {
3301 vFAIL("Variable length lookbehind not implemented");
3302 }
3303 else if (minnext > (I32)U8_MAX) {
3304 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3305 }
3306 scan->flags = (U8)minnext;
3307 }
3308 if (data) {
3309 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3310 pars++;
3311 if (data_fake.flags & SF_HAS_EVAL)
3312 data->flags |= SF_HAS_EVAL;
3313 data->whilem_c = data_fake.whilem_c;
3314 }
3315 if (f & SCF_DO_STCLASS_AND) {
3316 const int was = (data->start_class->flags & ANYOF_EOS);
3317
3318 cl_and(data->start_class, &intrnl);
3319 if (was)
3320 data->start_class->flags |= ANYOF_EOS;
3321 }
be8e71aa 3322 }
1de06328
YO
3323#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3324 else {
3325 /* Positive Lookahead/lookbehind
3326 In this case we can do fixed string optimisation,
3327 but we must be careful about it. Note in the case of
3328 lookbehind the positions will be offset by the minimum
3329 length of the pattern, something we won't know about
3330 until after the recurse.
3331 */
3332 I32 deltanext, fake = 0;
3333 regnode *nscan;
3334 struct regnode_charclass_class intrnl;
3335 int f = 0;
3336 /* We use SAVEFREEPV so that when the full compile
3337 is finished perl will clean up the allocated
3338 minlens when its all done. This was we don't
3339 have to worry about freeing them when we know
3340 they wont be used, which would be a pain.
3341 */
3342 I32 *minnextp;
3343 Newx( minnextp, 1, I32 );
3344 SAVEFREEPV(minnextp);
3345
3346 if (data) {
3347 StructCopy(data, &data_fake, scan_data_t);
3348 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3349 f |= SCF_DO_SUBSTR;
3350 if (scan->flags)
3351 scan_commit(pRExC_state, &data_fake,minlenp);
3352 data_fake.last_found=newSVsv(data->last_found);
3353 }
3354 }
3355 else
3356 data_fake.last_closep = &fake;
3357 data_fake.flags = 0;
3358 if (is_inf)
3359 data_fake.flags |= SF_IS_INF;
3360 if ( flags & SCF_DO_STCLASS && !scan->flags
3361 && OP(scan) == IFMATCH ) { /* Lookahead */
3362 cl_init(pRExC_state, &intrnl);
3363 data_fake.start_class = &intrnl;
3364 f |= SCF_DO_STCLASS_AND;
3365 }
3366 if (flags & SCF_WHILEM_VISITED_POS)
3367 f |= SCF_WHILEM_VISITED_POS;
3368 next = regnext(scan);
3369 nscan = NEXTOPER(NEXTOPER(scan));
3370
3371 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3372 if (scan->flags) {
3373 if (deltanext) {
3374 vFAIL("Variable length lookbehind not implemented");
3375 }
3376 else if (*minnextp > (I32)U8_MAX) {
3377 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3378 }
3379 scan->flags = (U8)*minnextp;
3380 }
3381
3382 *minnextp += min;
3383
3384
3385 if (f & SCF_DO_STCLASS_AND) {
3386 const int was = (data->start_class->flags & ANYOF_EOS);
3387
3388 cl_and(data->start_class, &intrnl);
3389 if (was)
3390 data->start_class->flags |= ANYOF_EOS;
3391 }
3392 if (data) {
3393 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3394 pars++;
3395 if (data_fake.flags & SF_HAS_EVAL)
3396 data->flags |= SF_HAS_EVAL;
3397 data->whilem_c = data_fake.whilem_c;
3398 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3399 if (RExC_rx->minlen<*minnextp)
3400 RExC_rx->minlen=*minnextp;
3401 scan_commit(pRExC_state, &data_fake, minnextp);
3402 SvREFCNT_dec(data_fake.last_found);
3403
3404 if ( data_fake.minlen_fixed != minlenp )
3405 {
3406 data->offset_fixed= data_fake.offset_fixed;
3407 data->minlen_fixed= data_fake.minlen_fixed;
3408 data->lookbehind_fixed+= scan->flags;
3409 }
3410 if ( data_fake.minlen_float != minlenp )
3411 {
3412 data->minlen_float= data_fake.minlen_float;
3413 data->offset_float_min=data_fake.offset_float_min;
3414 data->offset_float_max=data_fake.offset_float_max;
3415 data->lookbehind_float+= scan->flags;
3416 }
3417 }
3418 }
3419
653099ff 3420
653099ff 3421 }
1de06328 3422#endif
a0ed51b3
LW
3423 }
3424 else if (OP(scan) == OPEN) {
c277df42 3425 pars++;
a0ed51b3 3426 }
cb434fcc 3427 else if (OP(scan) == CLOSE) {
eb160463 3428 if ((I32)ARG(scan) == is_par) {
cb434fcc 3429 next = regnext(scan);
c277df42 3430
cb434fcc
IZ
3431 if ( next && (OP(next) != WHILEM) && next < last)
3432 is_par = 0; /* Disable optimization */
3433 }
3434 if (data)
3435 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3436 }
3437 else if (OP(scan) == EVAL) {
c277df42
IZ
3438 if (data)
3439 data->flags |= SF_HAS_EVAL;
3440 }
96776eda 3441 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3442 if (flags & SCF_DO_SUBSTR) {
1de06328 3443 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3444 data->longest = &(data->longest_float);
3445 }
3446 is_inf = is_inf_internal = 1;
653099ff 3447 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3448 cl_anything(pRExC_state, data->start_class);
96776eda 3449 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3450 }
786e8c11
YO
3451#ifdef TRIE_STUDY_OPT
3452#ifdef FULL_TRIE_STUDY
3453 else if (PL_regkind[OP(scan)] == TRIE) {
3454 /* NOTE - There is similar code to this block above for handling
3455 BRANCH nodes on the initial study. If you change stuff here
3456 check there too. */
3457 regnode *tail= regnext(scan);
3458 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3459 I32 max1 = 0, min1 = I32_MAX;
3460 struct regnode_charclass_class accum;
3461
3462 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3463 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3464 if (flags & SCF_DO_STCLASS)
3465 cl_init_zero(pRExC_state, &accum);
3466
3467 if (!trie->jump) {
3468 min1= trie->minlen;
3469 max1= trie->maxlen;
3470 } else {
3471 const regnode *nextbranch= NULL;
3472 U32 word;
3473
3474 for ( word=1 ; word <= trie->wordcount ; word++)
3475 {
3476 I32 deltanext=0, minnext=0, f = 0, fake;
3477 struct regnode_charclass_class this_class;
3478
3479 data_fake.flags = 0;
3480 if (data) {
3481 data_fake.whilem_c = data->whilem_c;
3482 data_fake.last_closep = data->last_closep;
3483 }
3484 else
3485 data_fake.last_closep = &fake;
3486
3487 if (flags & SCF_DO_STCLASS) {
3488 cl_init(pRExC_state, &this_class);
3489 data_fake.start_class = &this_class;
3490 f = SCF_DO_STCLASS_AND;
3491 }
3492 if (flags & SCF_WHILEM_VISITED_POS)
3493 f |= SCF_WHILEM_VISITED_POS;
3494
3495 if (trie->jump[word]) {
3496 if (!nextbranch)
3497 nextbranch = tail - trie->jump[0];
3498 scan= tail - trie->jump[word];
3499 /* We go from the jump point to the branch that follows
3500 it. Note this means we need the vestigal unused branches
3501 even though they arent otherwise used.
3502 */
1de06328 3503 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
786e8c11
YO
3504 (regnode *)nextbranch, &data_fake, f,depth+1);
3505 }
3506 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3507 nextbranch= regnext((regnode*)nextbranch);
3508
3509 if (min1 > (I32)(minnext + trie->minlen))
3510 min1 = minnext + trie->minlen;
3511 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3512 max1 = minnext + deltanext + trie->maxlen;
3513 if (deltanext == I32_MAX)
3514 is_inf = is_inf_internal = 1;
3515
3516 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3517 pars++;
3518
3519 if (data) {
3520 if (data_fake.flags & SF_HAS_EVAL)
3521 data->flags |= SF_HAS_EVAL;
3522 data->whilem_c = data_fake.whilem_c;
3523 }
3524 if (flags & SCF_DO_STCLASS)
3525 cl_or(pRExC_state, &accum, &this_class);
3526 }
3527 }
3528 if (flags & SCF_DO_SUBSTR) {
3529 data->pos_min += min1;
3530 data->pos_delta += max1 - min1;
3531 if (max1 != min1 || is_inf)
3532 data->longest = &(data->longest_float);
3533 }
3534 min += min1;
3535 delta += max1 - min1;
3536 if (flags & SCF_DO_STCLASS_OR) {
3537 cl_or(pRExC_state, data->start_class, &accum);
3538 if (min1) {
3539 cl_and(data->start_class, &and_with);
3540 flags &= ~SCF_DO_STCLASS;
3541 }
3542 }
3543 else if (flags & SCF_DO_STCLASS_AND) {
3544 if (min1) {
3545 cl_and(data->start_class, &accum);
3546 flags &= ~SCF_DO_STCLASS;
3547 }
3548 else {
3549 /* Switch to OR mode: cache the old value of
3550 * data->start_class */
3551 StructCopy(data->start_class, &and_with,
3552 struct regnode_charclass_class);
3553 flags &= ~SCF_DO_STCLASS_AND;
3554 StructCopy(&accum, data->start_class,
3555 struct regnode_charclass_class);
3556 flags |= SCF_DO_STCLASS_OR;
3557 data->start_class->flags |= ANYOF_EOS;
3558 }
3559 }
3560 scan= tail;
3561 continue;
3562 }
3563#else
3564 else if (PL_regkind[OP(scan)] == TRIE) {
3565 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3566 U8*bang=NULL;
3567
3568 min += trie->minlen;
3569 delta += (trie->maxlen - trie->minlen);
3570 flags &= ~SCF_DO_STCLASS; /* xxx */
3571 if (flags & SCF_DO_SUBSTR) {
1de06328 3572 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3573 data->pos_min += trie->minlen;
3574 data->pos_delta += (trie->maxlen - trie->minlen);
3575 if (trie->maxlen != trie->minlen)
3576 data->longest = &(data->longest_float);
3577 }
3578 if (trie->jump) /* no more substrings -- for now /grr*/
3579 flags &= ~SCF_DO_SUBSTR;
3580 }
3581#endif /* old or new */
3582#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3583 /* Else: zero-length, ignore. */
3584 scan = regnext(scan);
3585 }
3586
3587 finish:
3588 *scanp = scan;
aca2d497 3589 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3590 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3591 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3592 if (is_par > (I32)U8_MAX)
c277df42
IZ
3593 is_par = 0;
3594 if (is_par && pars==1 && data) {
3595 data->flags |= SF_IN_PAR;
3596 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3597 }
3598 else if (pars && data) {
c277df42
IZ
3599 data->flags |= SF_HAS_PAR;
3600 data->flags &= ~SF_IN_PAR;
3601 }
653099ff
GS
3602 if (flags & SCF_DO_STCLASS_OR)
3603 cl_and(data->start_class, &and_with);
786e8c11
YO
3604 if (flags & SCF_TRIE_RESTUDY)
3605 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3606
3607 DEBUG_STUDYDATA(data,depth);
3608
c277df42
IZ
3609 return min;
3610}
3611
76e3520e 3612STATIC I32
5f66b61c 3613S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3614{
830247a4 3615 if (RExC_rx->data) {
b81d288d
AB
3616 Renewc(RExC_rx->data,
3617 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3618 char, struct reg_data);
830247a4
IZ
3619 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3620 RExC_rx->data->count += n;
a0ed51b3
LW
3621 }
3622 else {
a02a5408 3623 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3624 char, struct reg_data);
a02a5408 3625 Newx(RExC_rx->data->what, n, U8);
830247a4 3626 RExC_rx->data->count = n;
c277df42 3627 }
830247a4
IZ
3628 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3629 return RExC_rx->data->count - n;
c277df42
IZ
3630}
3631
76234dfb 3632#ifndef PERL_IN_XSUB_RE
d88dccdf 3633void
864dbfa3 3634Perl_reginitcolors(pTHX)
d88dccdf 3635{
97aff369 3636 dVAR;
1df70142 3637 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3638 if (s) {
1df70142
AL
3639 char *t = savepv(s);
3640 int i = 0;
3641 PL_colors[0] = t;
d88dccdf 3642 while (++i < 6) {
1df70142
AL
3643 t = strchr(t, '\t');
3644 if (t) {
3645 *t = '\0';
3646 PL_colors[i] = ++t;
d88dccdf
IZ
3647 }
3648 else
1df70142 3649 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3650 }
3651 } else {
1df70142 3652 int i = 0;
b81d288d 3653 while (i < 6)
06b5626a 3654 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3655 }
3656 PL_colorset = 1;
3657}
76234dfb 3658#endif
8615cb43 3659
07be1b83 3660
786e8c11
YO
3661#ifdef TRIE_STUDY_OPT
3662#define CHECK_RESTUDY_GOTO \
3663 if ( \
3664 (data.flags & SCF_TRIE_RESTUDY) \
3665 && ! restudied++ \
3666 ) goto reStudy
3667#else
3668#define CHECK_RESTUDY_GOTO
3669#endif
a687059c 3670/*
e50aee73 3671 - pregcomp - compile a regular expression into internal code
a687059c
LW
3672 *
3673 * We can't allocate space until we know how big the compiled form will be,
3674 * but we can't compile it (and thus know how big it is) until we've got a
3675 * place to put the code. So we cheat: we compile it twice, once with code
3676 * generation turned off and size counting turned on, and once "for real".
3677 * This also means that we don't allocate space until we are sure that the
3678 * thing really will compile successfully, and we never have to move the
3679 * code and thus invalidate pointers into it. (Note that it has to be in
3680 * one piece because free() must be able to free it all.) [NB: not true in perl]
3681 *
3682 * Beware that the optimization-preparation code in here knows about some
3683 * of the structure of the compiled regexp. [I'll say.]
3684 */
3685regexp *
864dbfa3 3686Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3687{
97aff369 3688 dVAR;
a0d0e21e 3689 register regexp *r;
c277df42 3690 regnode *scan;
c277df42 3691 regnode *first;
a0d0e21e 3692 I32 flags;
a0d0e21e
LW
3693 I32 minlen = 0;
3694 I32 sawplus = 0;
3695 I32 sawopen = 0;
2c2d71f5 3696 scan_data_t data;
830247a4 3697 RExC_state_t RExC_state;
be8e71aa 3698 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3699#ifdef TRIE_STUDY_OPT
3700 int restudied= 0;
3701 RExC_state_t copyRExC_state;
3702#endif
a0d0e21e 3703
a3621e74
YO
3704 GET_RE_DEBUG_FLAGS_DECL;
3705
a0d0e21e 3706 if (exp == NULL)
c277df42 3707 FAIL("NULL regexp argument");
a0d0e21e 3708
a5961de5 3709 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3710
5cfc7842 3711 RExC_precomp = exp;
a3621e74
YO
3712 DEBUG_r(if (!PL_colorset) reginitcolors());
3713 DEBUG_COMPILE_r({
ab3bbdeb
YO
3714 SV *dsv= sv_newmortal();
3715 RE_PV_QUOTED_DECL(s, RExC_utf8,
3716 dsv, RExC_precomp, (xend - exp), 60);
3717 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3718 PL_colors[4],PL_colors[5],s);
a5961de5 3719 });
e2509266 3720 RExC_flags = pm->op_pmflags;
830247a4 3721 RExC_sawback = 0;
bbce6d69 3722
830247a4