This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH] Better version of the Aho-Corasick patch and lots of benchmarks.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad
YO
125#ifdef DEBUGGING
126 char *lastparse;
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
174#define TRIE_STCLASS
175#endif
2c2d71f5
JH
176/* Length of a variant. */
177
178typedef struct scan_data_t {
179 I32 len_min;
180 I32 len_delta;
181 I32 pos_min;
182 I32 pos_delta;
183 SV *last_found;
184 I32 last_end; /* min value, <0 unless valid. */
185 I32 last_start_min;
186 I32 last_start_max;
187 SV **longest; /* Either &l_fixed, or &l_float. */
188 SV *longest_fixed;
189 I32 offset_fixed;
190 SV *longest_float;
191 I32 offset_float_min;
192 I32 offset_float_max;
193 I32 flags;
194 I32 whilem_c;
cb434fcc 195 I32 *last_closep;
653099ff 196 struct regnode_charclass_class *start_class;
2c2d71f5
JH
197} scan_data_t;
198
a687059c 199/*
e50aee73 200 * Forward declarations for pregcomp()'s friends.
a687059c 201 */
a0d0e21e 202
27da23d5
JH
203static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
205
206#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
207#define SF_BEFORE_SEOL 0x0001
208#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
209#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
211
09b7f37c
CB
212#ifdef NO_UNARY_PLUS
213# define SF_FIX_SHIFT_EOL (0+2)
214# define SF_FL_SHIFT_EOL (0+4)
215#else
216# define SF_FIX_SHIFT_EOL (+2)
217# define SF_FL_SHIFT_EOL (+4)
218#endif
c277df42
IZ
219
220#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
222
223#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
225#define SF_IS_INF 0x0040
226#define SF_HAS_PAR 0x0080
227#define SF_IN_PAR 0x0100
228#define SF_HAS_EVAL 0x0200
229#define SCF_DO_SUBSTR 0x0400
653099ff
GS
230#define SCF_DO_STCLASS_AND 0x0800
231#define SCF_DO_STCLASS_OR 0x1000
232#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 233#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 234
07be1b83
YO
235#define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
236
eb160463 237#define UTF (RExC_utf8 != 0)
e2509266
JH
238#define LOC ((RExC_flags & PMf_LOCALE) != 0)
239#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 240
ffc61ed2 241#define OOB_UNICODE 12345678
93733859 242#define OOB_NAMEDCLASS -1
b8c5462f 243
a0ed51b3
LW
244#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
246
8615cb43 247
b45f050a
JF
248/* length of regex to show in messages that don't mark a position within */
249#define RegexLengthToShowInErrorMessages 127
250
251/*
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
255 */
7253e4e3
RK
256#define MARKER1 "<-- HERE" /* marker as it appears in the description */
257#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 258
7253e4e3 259#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
260
261/*
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
264 * "...".
265 */
ccb2c380 266#define FAIL(msg) STMT_START { \
bfed75c6 267 const char *ellipses = ""; \
ccb2c380
MP
268 IV len = RExC_end - RExC_precomp; \
269 \
270 if (!SIZE_ONLY) \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
275 ellipses = "..."; \
276 } \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
279} STMT_END
8615cb43 280
b45f050a 281/*
b45f050a
JF
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
283 */
ccb2c380 284#define Simple_vFAIL(m) STMT_START { \
a28509cc 285 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
288} STMT_END
b45f050a
JF
289
290/*
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
292 */
ccb2c380
MP
293#define vFAIL(m) STMT_START { \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 Simple_vFAIL(m); \
297} STMT_END
b45f050a
JF
298
299/*
300 * Like Simple_vFAIL(), but accepts two arguments.
301 */
ccb2c380 302#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 303 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
306} STMT_END
b45f050a
JF
307
308/*
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
310 */
ccb2c380
MP
311#define vFAIL2(m,a1) STMT_START { \
312 if (!SIZE_ONLY) \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
315} STMT_END
b45f050a
JF
316
317
318/*
319 * Like Simple_vFAIL(), but accepts three arguments.
320 */
ccb2c380 321#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 322 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
325} STMT_END
b45f050a
JF
326
327/*
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
329 */
ccb2c380
MP
330#define vFAIL3(m,a1,a2) STMT_START { \
331 if (!SIZE_ONLY) \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
334} STMT_END
b45f050a
JF
335
336/*
337 * Like Simple_vFAIL(), but accepts four arguments.
338 */
ccb2c380 339#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 340 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
343} STMT_END
b45f050a 344
ccb2c380 345#define vWARN(loc,m) STMT_START { \
a28509cc 346 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351#define vWARNdep(loc,m) STMT_START { \
a28509cc 352 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358
359#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 360 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
363} STMT_END
364
365#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 366 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
370
371#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 372 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
375} STMT_END
376
377#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 378 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
9d1d55b5 382
8615cb43 383
cd439c50 384/* Allow for side effects in s */
ccb2c380
MP
385#define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
387} STMT_END
cd439c50 388
fac92740
MJD
389/* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
07be1b83 393 * Position is 1 indexed.
fac92740
MJD
394 */
395
ccb2c380
MP
396#define Set_Node_Offset_To_R(node,byte) STMT_START { \
397 if (! SIZE_ONLY) { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 399 __LINE__, (node), (int)(byte))); \
ccb2c380 400 if((node) < 0) { \
551405c4 401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
402 } else { \
403 RExC_offsets[2*(node)-1] = (byte); \
404 } \
405 } \
406} STMT_END
407
408#define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
411
412#define Set_Node_Length_To_R(node,len) STMT_START { \
413 if (! SIZE_ONLY) { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 415 __LINE__, (int)(node), (int)(len))); \
ccb2c380 416 if((node) < 0) { \
551405c4 417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
418 } else { \
419 RExC_offsets[2*(node)] = (len); \
420 } \
421 } \
422} STMT_END
423
424#define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427#define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
429
430/* Get offsets and lengths */
431#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
433
07be1b83
YO
434#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
437} STMT_END
438
439
440#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441#define EXPERIMENTAL_INPLACESCAN
442#endif
443
acfe0abc 444static void clear_re(pTHX_ void *r);
4327152a 445
653099ff
GS
446/* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
449
4327152a 450STATIC void
097eb12c 451S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 452{
e1ec3a88
AL
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 455
c277df42 456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 457 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
b81d288d 461 data->flags
c277df42
IZ
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
463 else
464 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
465 }
466 else {
c277df42 467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
468 data->offset_float_max = (l
469 ? data->last_start_max
c277df42 470 : data->pos_min + data->pos_delta);
9051bda5
HS
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
c277df42 473 if (data->flags & SF_BEFORE_EOL)
b81d288d 474 data->flags
c277df42
IZ
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
476 else
477 data->flags &= ~SF_FL_BEFORE_EOL;
478 }
479 }
480 SvCUR_set(data->last_found, 0);
0eda9292 481 {
a28509cc 482 SV * const sv = data->last_found;
097eb12c
AL
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
485 if (mg)
486 mg->mg_len = 0;
487 }
0eda9292 488 }
c277df42
IZ
489 data->last_end = -1;
490 data->flags &= ~SF_BEFORE_EOL;
491}
492
653099ff
GS
493/* Can match anything (initialization) */
494STATIC void
097eb12c 495S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 496{
653099ff 497 ANYOF_CLASS_ZERO(cl);
f8bef550 498 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
500 if (LOC)
501 cl->flags |= ANYOF_LOCALE;
502}
503
504/* Can match anything (initialization) */
505STATIC int
5f66b61c 506S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
507{
508 int value;
509
aaa51d5e 510 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
512 return 1;
1aa99e6b
IH
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
514 return 0;
f8bef550
NC
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
516 return 0;
653099ff
GS
517 return 1;
518}
519
520/* Can match anything (initialization) */
521STATIC void
097eb12c 522S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
8ecf7187 524 Zero(cl, 1, struct regnode_charclass_class);
653099ff 525 cl->type = ANYOF;
830247a4 526 cl_anything(pRExC_state, cl);
653099ff
GS
527}
528
529STATIC void
097eb12c 530S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 531{
8ecf7187 532 Zero(cl, 1, struct regnode_charclass_class);
653099ff 533 cl->type = ANYOF;
830247a4 534 cl_anything(pRExC_state, cl);
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* 'And' a given class with another one. Can create false positives */
540/* We assume that cl is not inverted */
541STATIC void
5f66b61c 542S_cl_and(struct regnode_charclass_class *cl,
a28509cc 543 const struct regnode_charclass_class *and_with)
653099ff 544{
653099ff
GS
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
550 int i;
551
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
555 else
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
1aa99e6b 561
14ebb1a2
JH
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
567 }
14ebb1a2
JH
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 570 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 573 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
574}
575
576/* 'OR' a given class with another one. Can create false positives */
577/* We assume that cl is not inverted */
578STATIC void
097eb12c 579S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 580{
653099ff
GS
581 if (or_with->flags & ANYOF_INVERT) {
582 /* We do not use
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
590 */
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
594 int i;
595
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
599 else {
830247a4 600 cl_anything(pRExC_state, cl);
653099ff
GS
601 }
602 } else {
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 605 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
606 || (cl->flags & ANYOF_FOLD)) ) {
607 int i;
608
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
616 }
617 }
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 619 cl_anything(pRExC_state, cl);
653099ff
GS
620 }
621 }
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
624
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
629 }
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
633 }
653099ff
GS
634}
635
5d1c421c 636/*
a3621e74 637
3dab1dad 638 make_trie(startbranch,first,last,tail,flags,depth)
a3621e74
YO
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
3dab1dad 646 depth : indent depth
a3621e74
YO
647
648Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
649
650A trie is an N'ary tree where the branches are determined by digital
651decomposition of the key. IE, at the root node you look up the 1st character and
652follow that branch repeat until you find the end of the branches. Nodes can be
653marked as "accepting" meaning they represent a complete word. Eg:
654
655 /he|she|his|hers/
656
657would convert into the following structure. Numbers represent states, letters
658following numbers represent valid transitions on the letter from that state, if
659the number is in square brackets it represents an accepting state, otherwise it
660will be in parenthesis.
661
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
663 | |
664 | (2)
665 | |
666 (1) +-i->(6)-+-s->[7]
667 |
668 +-s->(3)-+-h->(4)-+-e->[5]
669
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
671
672This shows that when matching against the string 'hers' we will begin at state 1
673read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676single traverse. We store a mapping from accepting to state to which word was
677matched, and then when we have multiple possibilities we try to complete the
678rest of the regex in the order in which they occured in the alternation.
679
680The only prior NFA like behaviour that would be changed by the TRIE support is
681the silent ignoring of duplicate alternations which are of the form:
682
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
684
685Thus EVAL blocks follwing a trie may be called a different number of times with
686and without the optimisation. With the optimisations dupes will be silently
687ignored. This inconsistant behaviour of EVAL type nodes is well established as
688the following demonstrates:
689
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
691
692which prints out 'word' three times, but
693
694 'words'=~/(word|word|word)(?{ print $1 })S/
695
696which doesnt print it out at all. This is due to other optimisations kicking in.
697
698Example of what happens on a structural level:
699
700The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
701
702 1: CURLYM[1] {1,32767}(18)
703 5: BRANCH(8)
704 6: EXACT <ac>(16)
705 8: BRANCH(11)
706 9: EXACT <ad>(16)
707 11: BRANCH(14)
708 12: EXACT <ab>(16)
709 16: SUCCEED(0)
710 17: NOTHING(18)
711 18: END(0)
712
713This would be optimizable with startbranch=5, first=5, last=16, tail=16
714and should turn into:
715
716 1: CURLYM[1] {1,32767}(18)
717 5: TRIE(16)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
719 <ac>
720 <ad>
721 <ab>
722 16: SUCCEED(0)
723 17: NOTHING(18)
724 18: END(0)
725
726Cases where tail != last would be like /(?foo|bar)baz/:
727
728 1: BRANCH(4)
729 2: EXACT <foo>(8)
730 4: BRANCH(7)
731 5: EXACT <bar>(8)
732 7: TAIL(8)
733 8: EXACT <baz>(10)
734 10: END(0)
735
736which would be optimizable with startbranch=1, first=1, last=7, tail=8
737and would end up looking like:
738
739 1: TRIE(8)
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
741 <foo>
742 <bar>
743 7: TAIL(8)
744 8: EXACT <baz>(10)
745 10: END(0)
746
3dab1dad
YO
747 d = uvuni_to_utf8_flags(d, uv, 0);
748
749is the recommended Unicode-aware way of saying
750
751 *(d++) = uv;
a3621e74
YO
752*/
753
3dab1dad
YO
754#define TRIE_STORE_REVCHAR \
755 STMT_START { \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
758 } STMT_END
a3621e74
YO
759
760#define TRIE_READ_CHAR STMT_START { \
07be1b83 761 wordlen++; \
a3621e74
YO
762 if ( UTF ) { \
763 if ( folder ) { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
766 foldlen -= len; \
767 scan += len; \
768 len = 0; \
769 } else { \
e1ec3a88 770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
774 } \
775 } else { \
e1ec3a88 776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
777 } \
778 } else { \
779 uvc = (U32)*uc; \
780 len = 1; \
781 } \
782} STMT_END
783
784
785#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
789
790#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
795 } \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
799} STMT_END
800
801#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 802 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
806} STMT_END
807
07be1b83
YO
808#define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
811 if (trie->wordlen) \
812 trie->wordlen[ curword ] = wordlen; \
3dab1dad
YO
813 trie->states[ state ].wordnum = ++curword; \
814 DEBUG_r({ \
815 /* store the word for dumping */ \
816 SV* tmp; \
07be1b83
YO
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
3dab1dad 819 else \
07be1b83 820 tmp = newSVpvn( "", 0 ); \
3dab1dad
YO
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
823 }); \
824 } else { \
825 NOOP; /* It's a dupe. So ignore it. */ \
826 } \
827} STMT_END
828
829#ifdef DEBUGGING
07be1b83 830/*
3dab1dad
YO
831 dump_trie(trie)
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
834
835 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
839
3dab1dad
YO
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
07be1b83
YO
842
843*/
3dab1dad
YO
844
845/*
846 dump_trie(trie)
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
849*/
850
851STATIC void
852S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
853{
854 U32 state;
855 GET_RE_DEBUG_FLAGS_DECL;
856
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
860
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV **tmp = av_fetch( trie->revcharmap, state, 0);
863 if ( tmp ) {
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
865 }
866 }
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
869
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
873
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
875 const U32 base = trie->states[ state ].trans.base;
876
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
878
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
881 } else {
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
883 }
884
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
886
887 if ( base ) {
888 U32 ofs = 0;
889
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
893 ofs++;
894
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
896
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
901 {
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
904 } else {
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
906 }
907 }
908
909 PerlIO_printf( Perl_debug_log, "]");
910
911 }
912 PerlIO_printf( Perl_debug_log, "\n" );
913 }
914}
915/*
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
921*/
922STATIC void
923S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
924{
925 U32 state;
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
931
932 for( state=1 ; state < next_alloc ; state ++ ) {
933 U16 charid;
934
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
939 } else {
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
942 );
943 }
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
945 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
950 );
951 }
952
953 }
954}
955
956/*
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
962*/
963STATIC void
964S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
965{
966 U32 state;
967 U16 charid;
968 GET_RE_DEBUG_FLAGS_DECL;
969
970 /*
971 print out the table precompression so that we can do a visual check
972 that they are identical.
973 */
974
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
976
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
978 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
979 if ( tmp ) {
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
981 }
982 }
983
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
985
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
988 }
989
990 PerlIO_printf( Perl_debug_log, "\n" );
991
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
993
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
997
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1001 }
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1004 } else {
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1007 }
1008 }
07be1b83 1009}
3dab1dad
YO
1010
1011#endif
1012
07be1b83
YO
1013#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1020 )
3dab1dad 1021
07be1b83
YO
1022STATIC void
1023S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1024{
1025/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1026
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1029 ISBN 0-201-10088-6
1030
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1036 Consider
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1041 */
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1044 U32 *q;
1045 U32 ucharcount = trie->uniquecharcount;
1046 U32 numstates = trie->laststate;
1047 U32 ubound = trie->lasttrans + ucharcount;
1048 U32 q_read = 0;
1049 U32 q_write = 0;
1050 U32 charid;
1051 U32 base = trie->states[ 1 ].trans.base;
1052 U32 newstate;
1053 U32 *fail;
1054 reg_ac_data *aho;
1055 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1056 GET_RE_DEBUG_FLAGS_DECL;
1057
1058 ARG_SET( stclass, data_slot );
1059 Newxz( aho, 1, reg_ac_data );
1060 RExC_rx->data->data[ data_slot ] = (void*)aho;
1061 aho->trie=trie;
1062 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1063 (trie->laststate+1)*sizeof(reg_trie_state));
1064 Newxz( q, numstates, U32);
1065 Newxz( aho->fail, numstates, U32 );
1066 fail= aho->fail;
1067 fail[ 0 ] = fail[ 1 ] = 1;
1068
1069 for ( charid = 0; charid < ucharcount ; charid++ ) {
1070 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1071 if ( newstate )
1072 {
1073 q[ q_write ] = newstate;
1074 /* set to point at the root */
1075 fail[ q[ q_write++ ] ]=1;
1076 }
1077 }
1078 while ( q_read < q_write) {
1079 U32 cur = q[ q_read++ % numstates ];
1080 U32 ch_state;
1081 base = trie->states[ cur ].trans.base;
1082
1083 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1084 if ( ( ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ) ) ) {
1085 U32 fail_state = cur;
1086 U32 fail_base;
1087 do {
1088 fail_state = fail[ fail_state ];
1089 fail_base = aho->states[ fail_state ].trans.base;
1090 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1091
1092 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1093 fail[ ch_state ] = fail_state;
1094 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1095 {
1096 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1097 }
1098 q[ q_write++ % numstates] = ch_state;
1099 }
1100 }
1101 }
1102
1103 DEBUG_TRIE_COMPILE_MORE_r({
1104 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1105 for( q_read=2; q_read<numstates; q_read++ ) {
1106 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1107 }
1108 PerlIO_printf(Perl_debug_log, "\n");
1109 });
1110 Safefree(q);
1111 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1112}
3dab1dad
YO
1113
1114
1115
a3621e74 1116STATIC I32
3dab1dad 1117S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
a3621e74 1118{
27da23d5 1119 dVAR;
a3621e74
YO
1120 /* first pass, loop through and scan words */
1121 reg_trie_data *trie;
1122 regnode *cur;
9f7f3913 1123 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1124 STRLEN len = 0;
1125 UV uvc = 0;
1126 U16 curword = 0;
1127 U32 next_alloc = 0;
1128 /* we just use folder as a flag in utf8 */
e1ec3a88 1129 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1130 ? PL_fold
1131 : ( flags == EXACTFL
1132 ? PL_fold_locale
1133 : NULL
1134 )
1135 );
1136
e1ec3a88 1137 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1138 SV *re_trie_maxbuff;
3dab1dad
YO
1139#ifndef DEBUGGING
1140 /* these are only used during construction but are useful during
8e11feef
RGS
1141 * debugging so we store them in the struct when debugging.
1142 * Wordcount is actually superfluous in debugging as we have
1143 * (AV*)trie->words to use for it, but that's not available when
1144 * not debugging... We could make the macro use the AV during
1145 * debugging though...
1146 */
3dab1dad
YO
1147 U16 trie_wordcount=0;
1148 STRLEN trie_charcount=0;
07be1b83 1149 /*U32 trie_laststate=0;*/
3dab1dad
YO
1150 AV *trie_revcharmap;
1151#endif
a3621e74
YO
1152 GET_RE_DEBUG_FLAGS_DECL;
1153
a02a5408 1154 Newxz( trie, 1, reg_trie_data );
a3621e74 1155 trie->refcount = 1;
3dab1dad 1156 trie->startstate = 1;
a3621e74 1157 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1158 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1159 if (!(UTF && folder))
1160 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1161 DEBUG_r({
1162 trie->words = newAV();
a3621e74 1163 });
3dab1dad 1164 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1165
0111c4fd 1166 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1167 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1168 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1169 }
3dab1dad
YO
1170 DEBUG_OPTIMISE_r({
1171 PerlIO_printf( Perl_debug_log,
1172 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1173 (int)depth * 2 + 2, "",
1174 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1175 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1176 });
a3621e74
YO
1177 /* -- First loop and Setup --
1178
1179 We first traverse the branches and scan each word to determine if it
1180 contains widechars, and how many unique chars there are, this is
1181 important as we have to build a table with at least as many columns as we
1182 have unique chars.
1183
1184 We use an array of integers to represent the character codes 0..255
1185 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1186 native representation of the character value as the key and IV's for the
1187 coded index.
1188
1189 *TODO* If we keep track of how many times each character is used we can
1190 remap the columns so that the table compression later on is more
1191 efficient in terms of memory by ensuring most common value is in the
1192 middle and the least common are on the outside. IMO this would be better
1193 than a most to least common mapping as theres a decent chance the most
1194 common letter will share a node with the least common, meaning the node
1195 will not be compressable. With a middle is most common approach the worst
1196 case is when we have the least common nodes twice.
1197
1198 */
1199
a3621e74 1200 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1201 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1202 const U8 *uc = (U8*)STRING( noper );
a28509cc 1203 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1204 STRLEN foldlen = 0;
1205 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1206 const U8 *scan = (U8*)NULL;
07be1b83 1207 U32 wordlen = 0; /* required init */
3dab1dad 1208 STRLEN chars=0;
a3621e74 1209
3dab1dad
YO
1210 TRIE_WORDCOUNT(trie)++;
1211 if (OP(noper) == NOTHING) {
1212 trie->minlen= 0;
1213 continue;
1214 }
1215 if (trie->bitmap) {
1216 TRIE_BITMAP_SET(trie,*uc);
1217 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1218 }
a3621e74 1219 for ( ; uc < e ; uc += len ) {
3dab1dad 1220 TRIE_CHARCOUNT(trie)++;
a3621e74 1221 TRIE_READ_CHAR;
3dab1dad 1222 chars++;
a3621e74
YO
1223 if ( uvc < 256 ) {
1224 if ( !trie->charmap[ uvc ] ) {
1225 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1226 if ( folder )
1227 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1228 TRIE_STORE_REVCHAR;
a3621e74
YO
1229 }
1230 } else {
1231 SV** svpp;
1232 if ( !trie->widecharmap )
1233 trie->widecharmap = newHV();
1234
1235 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1236
1237 if ( !svpp )
e4584336 1238 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1239
1240 if ( !SvTRUE( *svpp ) ) {
1241 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1242 TRIE_STORE_REVCHAR;
a3621e74
YO
1243 }
1244 }
1245 }
3dab1dad
YO
1246 if( cur == first ) {
1247 trie->minlen=chars;
1248 trie->maxlen=chars;
1249 } else if (chars < trie->minlen) {
1250 trie->minlen=chars;
1251 } else if (chars > trie->maxlen) {
1252 trie->maxlen=chars;
1253 }
1254
a3621e74
YO
1255 } /* end first pass */
1256 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1257 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1258 (int)depth * 2 + 2,"",
1259 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1260 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, trie->minlen, trie->maxlen )
a3621e74 1261 );
07be1b83 1262 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
a3621e74
YO
1263
1264 /*
1265 We now know what we are dealing with in terms of unique chars and
1266 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1267 representation using a flat table will take. If it's over a reasonable
1268 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1269 conservative but potentially much slower representation using an array
1270 of lists.
1271
1272 At the end we convert both representations into the same compressed
1273 form that will be used in regexec.c for matching with. The latter
1274 is a form that cannot be used to construct with but has memory
1275 properties similar to the list form and access properties similar
1276 to the table form making it both suitable for fast searches and
1277 small enough that its feasable to store for the duration of a program.
1278
1279 See the comment in the code where the compressed table is produced
1280 inplace from the flat tabe representation for an explanation of how
1281 the compression works.
1282
1283 */
1284
1285
3dab1dad 1286 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1287 /*
1288 Second Pass -- Array Of Lists Representation
1289
1290 Each state will be represented by a list of charid:state records
1291 (reg_trie_trans_le) the first such element holds the CUR and LEN
1292 points of the allocated array. (See defines above).
1293
1294 We build the initial structure using the lists, and then convert
1295 it into the compressed table form which allows faster lookups
1296 (but cant be modified once converted).
a3621e74
YO
1297 */
1298
a3621e74
YO
1299 STRLEN transcount = 1;
1300
3dab1dad 1301 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1302 TRIE_LIST_NEW(1);
1303 next_alloc = 2;
1304
1305 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1306
c445ea15
AL
1307 regnode * const noper = NEXTOPER( cur );
1308 U8 *uc = (U8*)STRING( noper );
1309 const U8 * const e = uc + STR_LEN( noper );
1310 U32 state = 1; /* required init */
1311 U16 charid = 0; /* sanity init */
1312 U8 *scan = (U8*)NULL; /* sanity init */
1313 STRLEN foldlen = 0; /* required init */
07be1b83 1314 U32 wordlen = 0; /* required init */
c445ea15
AL
1315 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1316
3dab1dad 1317 if (OP(noper) != NOTHING) {
c445ea15
AL
1318 for ( ; uc < e ; uc += len ) {
1319
1320 TRIE_READ_CHAR;
1321
1322 if ( uvc < 256 ) {
1323 charid = trie->charmap[ uvc ];
1324 } else {
1325 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1326 if ( !svpp ) {
1327 charid = 0;
1328 } else {
1329 charid=(U16)SvIV( *svpp );
1330 }
1331 }
1332 if ( charid ) {
a3621e74 1333
c445ea15
AL
1334 U16 check;
1335 U32 newstate = 0;
a3621e74 1336
c445ea15
AL
1337 charid--;
1338 if ( !trie->states[ state ].trans.list ) {
1339 TRIE_LIST_NEW( state );
1340 }
1341 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1342 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1343 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1344 break;
1345 }
1346 }
1347 if ( ! newstate ) {
1348 newstate = next_alloc++;
1349 TRIE_LIST_PUSH( state, charid, newstate );
1350 transcount++;
1351 }
1352 state = newstate;
1353 } else {
1354 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1355 }
c445ea15
AL
1356 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1357 }
c445ea15 1358 }
3dab1dad 1359 TRIE_HANDLE_WORD(state);
a3621e74
YO
1360
1361 } /* end second pass */
1362
3dab1dad 1363 TRIE_LASTSTATE(trie) = next_alloc;
a3621e74
YO
1364 Renew( trie->states, next_alloc, reg_trie_state );
1365
3dab1dad
YO
1366 /* and now dump it out before we compress it */
1367 DEBUG_TRIE_COMPILE_MORE_r(
1368 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1369 );
a3621e74 1370
a02a5408 1371 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1372 {
1373 U32 state;
a3621e74
YO
1374 U32 tp = 0;
1375 U32 zp = 0;
1376
1377
1378 for( state=1 ; state < next_alloc ; state ++ ) {
1379 U32 base=0;
1380
1381 /*
1382 DEBUG_TRIE_COMPILE_MORE_r(
1383 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1384 );
1385 */
1386
1387 if (trie->states[state].trans.list) {
1388 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1389 U16 maxid=minid;
a28509cc 1390 U16 idx;
a3621e74
YO
1391
1392 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1393 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1394 if ( forid < minid ) {
1395 minid=forid;
1396 } else if ( forid > maxid ) {
1397 maxid=forid;
1398 }
a3621e74
YO
1399 }
1400 if ( transcount < tp + maxid - minid + 1) {
1401 transcount *= 2;
1402 Renew( trie->trans, transcount, reg_trie_trans );
1403 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1404 }
1405 base = trie->uniquecharcount + tp - minid;
1406 if ( maxid == minid ) {
1407 U32 set = 0;
1408 for ( ; zp < tp ; zp++ ) {
1409 if ( ! trie->trans[ zp ].next ) {
1410 base = trie->uniquecharcount + zp - minid;
1411 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1412 trie->trans[ zp ].check = state;
1413 set = 1;
1414 break;
1415 }
1416 }
1417 if ( !set ) {
1418 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1419 trie->trans[ tp ].check = state;
1420 tp++;
1421 zp = tp;
1422 }
1423 } else {
1424 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1425 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1426 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1427 trie->trans[ tid ].check = state;
1428 }
1429 tp += ( maxid - minid + 1 );
1430 }
1431 Safefree(trie->states[ state ].trans.list);
1432 }
1433 /*
1434 DEBUG_TRIE_COMPILE_MORE_r(
1435 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1436 );
1437 */
1438 trie->states[ state ].trans.base=base;
1439 }
cc601c31 1440 trie->lasttrans = tp + 1;
a3621e74
YO
1441 }
1442 } else {
1443 /*
1444 Second Pass -- Flat Table Representation.
1445
1446 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1447 We know that we will need Charcount+1 trans at most to store the data
1448 (one row per char at worst case) So we preallocate both structures
1449 assuming worst case.
1450
1451 We then construct the trie using only the .next slots of the entry
1452 structs.
1453
1454 We use the .check field of the first entry of the node temporarily to
1455 make compression both faster and easier by keeping track of how many non
1456 zero fields are in the node.
1457
1458 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1459 transition.
1460
1461 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1462 number representing the first entry of the node, and state as a
1463 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1464 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1465 are 2 entrys per node. eg:
1466
1467 A B A B
1468 1. 2 4 1. 3 7
1469 2. 0 3 3. 0 5
1470 3. 0 0 5. 0 0
1471 4. 0 0 7. 0 0
1472
1473 The table is internally in the right hand, idx form. However as we also
1474 have to deal with the states array which is indexed by nodenum we have to
1475 use TRIE_NODENUM() to convert.
1476
1477 */
1478
3dab1dad
YO
1479
1480 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1481 reg_trie_trans );
3dab1dad 1482 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1483 next_alloc = trie->uniquecharcount + 1;
1484
3dab1dad 1485
a3621e74
YO
1486 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1487
c445ea15 1488 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1489 const U8 *uc = (U8*)STRING( noper );
1490 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1491
1492 U32 state = 1; /* required init */
1493
1494 U16 charid = 0; /* sanity init */
1495 U32 accept_state = 0; /* sanity init */
1496 U8 *scan = (U8*)NULL; /* sanity init */
1497
1498 STRLEN foldlen = 0; /* required init */
07be1b83 1499 U32 wordlen = 0; /* required init */
a3621e74
YO
1500 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1501
3dab1dad 1502 if ( OP(noper) != NOTHING ) {
a3621e74
YO
1503 for ( ; uc < e ; uc += len ) {
1504
1505 TRIE_READ_CHAR;
1506
1507 if ( uvc < 256 ) {
1508 charid = trie->charmap[ uvc ];
1509 } else {
c445ea15
AL
1510 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1511 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1512 }
1513 if ( charid ) {
1514 charid--;
1515 if ( !trie->trans[ state + charid ].next ) {
1516 trie->trans[ state + charid ].next = next_alloc;
1517 trie->trans[ state ].check++;
1518 next_alloc += trie->uniquecharcount;
1519 }
1520 state = trie->trans[ state + charid ].next;
1521 } else {
e4584336 1522 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1523 }
1524 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1525 }
a3621e74 1526 }
3dab1dad
YO
1527 accept_state = TRIE_NODENUM( state );
1528 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1529
1530 } /* end second pass */
1531
3dab1dad
YO
1532 /* and now dump it out before we compress it */
1533 DEBUG_TRIE_COMPILE_MORE_r(
1534 dump_trie_interim_table(trie,next_alloc,depth+1)
1535 );
a3621e74 1536
a3621e74
YO
1537 {
1538 /*
1539 * Inplace compress the table.*
1540
1541 For sparse data sets the table constructed by the trie algorithm will
1542 be mostly 0/FAIL transitions or to put it another way mostly empty.
1543 (Note that leaf nodes will not contain any transitions.)
1544
1545 This algorithm compresses the tables by eliminating most such
1546 transitions, at the cost of a modest bit of extra work during lookup:
1547
1548 - Each states[] entry contains a .base field which indicates the
1549 index in the state[] array wheres its transition data is stored.
1550
1551 - If .base is 0 there are no valid transitions from that node.
1552
1553 - If .base is nonzero then charid is added to it to find an entry in
1554 the trans array.
1555
1556 -If trans[states[state].base+charid].check!=state then the
1557 transition is taken to be a 0/Fail transition. Thus if there are fail
1558 transitions at the front of the node then the .base offset will point
1559 somewhere inside the previous nodes data (or maybe even into a node
1560 even earlier), but the .check field determines if the transition is
1561 valid.
1562
1563 The following process inplace converts the table to the compressed
1564 table: We first do not compress the root node 1,and mark its all its
1565 .check pointers as 1 and set its .base pointer as 1 as well. This
1566 allows to do a DFA construction from the compressed table later, and
1567 ensures that any .base pointers we calculate later are greater than
1568 0.
1569
1570 - We set 'pos' to indicate the first entry of the second node.
1571
1572 - We then iterate over the columns of the node, finding the first and
1573 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1574 and set the .check pointers accordingly, and advance pos
1575 appropriately and repreat for the next node. Note that when we copy
1576 the next pointers we have to convert them from the original
1577 NODEIDX form to NODENUM form as the former is not valid post
1578 compression.
1579
1580 - If a node has no transitions used we mark its base as 0 and do not
1581 advance the pos pointer.
1582
1583 - If a node only has one transition we use a second pointer into the
1584 structure to fill in allocated fail transitions from other states.
1585 This pointer is independent of the main pointer and scans forward
1586 looking for null transitions that are allocated to a state. When it
1587 finds one it writes the single transition into the "hole". If the
1588 pointer doesnt find one the single transition is appeneded as normal.
1589
1590 - Once compressed we can Renew/realloc the structures to release the
1591 excess space.
1592
1593 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1594 specifically Fig 3.47 and the associated pseudocode.
1595
1596 demq
1597 */
a3b680e6 1598 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1599 U32 state, charid;
a3621e74 1600 U32 pos = 0, zp=0;
3dab1dad 1601 TRIE_LASTSTATE(trie) = laststate;
a3621e74
YO
1602
1603 for ( state = 1 ; state < laststate ; state++ ) {
1604 U8 flag = 0;
a28509cc
AL
1605 const U32 stateidx = TRIE_NODEIDX( state );
1606 const U32 o_used = trie->trans[ stateidx ].check;
1607 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1608 trie->trans[ stateidx ].check = 0;
1609
1610 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1611 if ( flag || trie->trans[ stateidx + charid ].next ) {
1612 if ( trie->trans[ stateidx + charid ].next ) {
1613 if (o_used == 1) {
1614 for ( ; zp < pos ; zp++ ) {
1615 if ( ! trie->trans[ zp ].next ) {
1616 break;
1617 }
1618 }
1619 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1620 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1621 trie->trans[ zp ].check = state;
1622 if ( ++zp > pos ) pos = zp;
1623 break;
1624 }
1625 used--;
1626 }
1627 if ( !flag ) {
1628 flag = 1;
1629 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1630 }
1631 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1632 trie->trans[ pos ].check = state;
1633 pos++;
1634 }
1635 }
1636 }
cc601c31 1637 trie->lasttrans = pos + 1;
a3621e74
YO
1638 Renew( trie->states, laststate + 1, reg_trie_state);
1639 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1640 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1641 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1642 (int)depth * 2 + 2,"",
1643 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1644 (IV)next_alloc,
1645 (IV)pos,
a3621e74
YO
1646 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1647 );
1648
1649 } /* end table compress */
1650 }
cc601c31
YO
1651 /* resize the trans array to remove unused space */
1652 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1653
3dab1dad
YO
1654 /* and now dump out the compressed format */
1655 DEBUG_TRIE_COMPILE_r(
1656 dump_trie(trie,depth+1)
1657 );
07be1b83 1658
3dab1dad
YO
1659 { /* Modify the program and insert the new TRIE node*/
1660 regnode *convert;
1661 U8 nodetype =(U8)(flags & 0xFF);
1662 char *str=NULL;
07be1b83
YO
1663#ifdef DEBUGGING
1664 U32 mjd_offset;
1665 U32 mjd_nodelen;
1666#endif
a3621e74 1667 /*
3dab1dad
YO
1668 This means we convert either the first branch or the first Exact,
1669 depending on whether the thing following (in 'last') is a branch
1670 or not and whther first is the startbranch (ie is it a sub part of
1671 the alternation or is it the whole thing.)
1672 Assuming its a sub part we conver the EXACT otherwise we convert
1673 the whole branch sequence, including the first.
a3621e74 1674 */
3dab1dad
YO
1675 /* Find the node we are going to overwrite */
1676 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1677 /* whole branch chain */
3dab1dad 1678 convert = first;
07be1b83
YO
1679 DEBUG_r({
1680 const regnode *nop = NEXTOPER( convert );
1681 mjd_offset= Node_Offset((nop));
1682 mjd_nodelen= Node_Length((nop));
1683 });
1684 } else {
1685 /* branch sub-chain */
3dab1dad
YO
1686 convert = NEXTOPER( first );
1687 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1688 DEBUG_r({
1689 mjd_offset= Node_Offset((convert));
1690 mjd_nodelen= Node_Length((convert));
1691 });
1692 }
1693 DEBUG_OPTIMISE_r(
1694 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1695 (int)depth * 2 + 2, "",
1696 mjd_offset,mjd_nodelen)
1697 );
a3621e74 1698
3dab1dad
YO
1699 /* But first we check to see if there is a common prefix we can
1700 split out as an EXACT and put in front of the TRIE node. */
1701 trie->startstate= 1;
1702 if ( trie->bitmap && !trie->widecharmap ) {
1703 U32 state;
1704 DEBUG_OPTIMISE_r(
8e11feef
RGS
1705 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1706 (int)depth * 2 + 2, "",
1707 TRIE_LASTSTATE(trie))
1708 );
1709 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
a3621e74 1710 U32 ofs = 0;
8e11feef
RGS
1711 I32 idx = -1;
1712 U32 count = 0;
1713 const U32 base = trie->states[ state ].trans.base;
a3621e74 1714
3dab1dad 1715 if ( trie->states[state].wordnum )
8e11feef 1716 count = 1;
a3621e74 1717
8e11feef 1718 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1719 if ( ( base + ofs >= trie->uniquecharcount ) &&
1720 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1721 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1722 {
3dab1dad 1723 if ( ++count > 1 ) {
8e11feef 1724 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1725 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1726 if ( state == 1 ) break;
3dab1dad
YO
1727 if ( count == 2 ) {
1728 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1729 DEBUG_OPTIMISE_r(
8e11feef
RGS
1730 PerlIO_printf(Perl_debug_log,
1731 "%*sNew Start State=%"UVuf" Class: [",
1732 (int)depth * 2 + 2, "",
3dab1dad
YO
1733 state));
1734 if (idx>-1) {
8e11feef 1735 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
07be1b83 1736 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1737
3dab1dad 1738 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1739 if ( folder )
1740 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1741 DEBUG_OPTIMISE_r(
07be1b83 1742 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1743 );
8e11feef
RGS
1744 }
1745 }
1746 TRIE_BITMAP_SET(trie,*ch);
1747 if ( folder )
1748 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1749 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1750 }
1751 idx = ofs;
1752 }
3dab1dad
YO
1753 }
1754 if ( count == 1 ) {
1755 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1756 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1757 DEBUG_OPTIMISE_r(
8e11feef
RGS
1758 PerlIO_printf( Perl_debug_log,
1759 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1760 (int)depth * 2 + 2, "",
1761 state, idx, ch)
3dab1dad
YO
1762 );
1763 if ( state==1 ) {
1764 OP( convert ) = nodetype;
1765 str=STRING(convert);
1766 STR_LEN(convert)=0;
1767 }
1768 *str++=*ch;
1769 STR_LEN(convert)++;
a3621e74 1770
8e11feef
RGS
1771 } else {
1772 if (state>1)
1773 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1774 break;
1775 }
1776 }
3dab1dad 1777 if (str) {
8e11feef 1778 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1779 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1780 trie->startstate = state;
07be1b83
YO
1781 trie->minlen -= (state - 1);
1782 trie->maxlen -= (state - 1);
1783 DEBUG_r({
1784 regnode *fix = convert;
1785 mjd_nodelen++;
1786 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1787 while( ++fix < n ) {
1788 Set_Node_Offset_Length(fix, 0, 0);
1789 }
1790 });
8e11feef
RGS
1791 if (trie->maxlen) {
1792 convert = n;
1793 } else {
3dab1dad
YO
1794 NEXT_OFF(convert) = (U16)(tail - convert);
1795 }
1796 }
1797 }
1798 if ( trie->maxlen ) {
1799 OP( convert ) = TRIE;
8e11feef
RGS
1800 NEXT_OFF( convert ) = (U16)(tail - convert);
1801 ARG_SET( convert, data_slot );
a3621e74 1802
3dab1dad
YO
1803 /* store the type in the flags */
1804 convert->flags = nodetype;
1805 /* XXX We really should free up the resource in trie now, as we wont use them */
1806 }
a3621e74
YO
1807 /* needed for dumping*/
1808 DEBUG_r({
1809 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
07be1b83
YO
1810 regnode *opt = convert;
1811 while (++opt<optimize) {
1812 Set_Node_Offset_Length(opt,0,0);
1813 }
a3621e74
YO
1814 /* We now need to mark all of the space originally used by the
1815 branches as optimized away. This keeps the dumpuntil from
1816 throwing a wobbly as it doesnt use regnext() to traverse the
1817 opcodes.
07be1b83 1818 We also "fix" the offsets
a3621e74
YO
1819 */
1820 while( optimize < last ) {
07be1b83 1821 mjd_nodelen += Node_Length((optimize));
a3621e74 1822 OP( optimize ) = OPTIMIZED;
07be1b83 1823 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1824 optimize++;
1825 }
07be1b83 1826 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1827 });
1828 } /* end node insert */
07be1b83 1829#ifndef DEBUGGING
6e8b4190 1830 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1831#endif
a3621e74
YO
1832 return 1;
1833}
1834
a3621e74 1835/*
5d1c421c
JH
1836 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1837 * These need to be revisited when a newer toolchain becomes available.
1838 */
1839#if defined(__sparc64__) && defined(__GNUC__)
1840# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1841# undef SPARC64_GCC_WORKAROUND
1842# define SPARC64_GCC_WORKAROUND 1
1843# endif
1844#endif
1845
07be1b83
YO
1846#define DEBUG_PEEP(str,scan,depth) \
1847 DEBUG_OPTIMISE_r({ \
1848 SV * const mysv=sv_newmortal(); \
1849 regnode *Next = regnext(scan); \
1850 regprop(RExC_rx, mysv, scan); \
1851 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1852 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1853 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1854 });
1855
1856#define JOIN_EXACT(scan,min,flags) \
1857 if (PL_regkind[OP(scan)] == EXACT) \
1858 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1859
1860U32
1861S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1862 /* Merge several consecutive EXACTish nodes into one. */
1863 regnode *n = regnext(scan);
1864 U32 stringok = 1;
1865 regnode *next = scan + NODE_SZ_STR(scan);
1866 U32 merged = 0;
1867 U32 stopnow = 0;
1868#ifdef DEBUGGING
1869 regnode *stop = scan;
1870#endif
1871 GET_RE_DEBUG_FLAGS_DECL;
1872 DEBUG_PEEP("join",scan,depth);
1873
1874 /* Skip NOTHING, merge EXACT*. */
1875 while (n &&
1876 ( PL_regkind[OP(n)] == NOTHING ||
1877 (stringok && (OP(n) == OP(scan))))
1878 && NEXT_OFF(n)
1879 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1880
1881 if (OP(n) == TAIL || n > next)
1882 stringok = 0;
1883 if (PL_regkind[OP(n)] == NOTHING) {
1884
1885 DEBUG_PEEP("skip:",n,depth);
1886 NEXT_OFF(scan) += NEXT_OFF(n);
1887 next = n + NODE_STEP_REGNODE;
1888#ifdef DEBUGGING
1889 if (stringok)
1890 stop = n;
1891#endif
1892 n = regnext(n);
1893 }
1894 else if (stringok) {
1895 const int oldl = STR_LEN(scan);
1896 regnode * const nnext = regnext(n);
1897
1898 DEBUG_PEEP("merg",n,depth);
1899
1900 merged++;
1901 if (oldl + STR_LEN(n) > U8_MAX)
1902 break;
1903 NEXT_OFF(scan) += NEXT_OFF(n);
1904 STR_LEN(scan) += STR_LEN(n);
1905 next = n + NODE_SZ_STR(n);
1906 /* Now we can overwrite *n : */
1907 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1908#ifdef DEBUGGING
1909 stop = next - 1;
1910#endif
1911 n = nnext;
1912 if (stopnow) break;
1913 }
1914
1915#ifdef EXPERIMENTAL_INPLACESCAN
1916 if (flags && !NEXT_OFF(n)) {
1917 DEBUG_PEEP("atch",val,depth);
1918 if (reg_off_by_arg[OP(n)]) {
1919 ARG_SET(n, val - n);
1920 }
1921 else {
1922 NEXT_OFF(n) = val - n;
1923 }
1924 stopnow=1;
1925 }
1926#endif
1927 }
1928
1929 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1930 /*
1931 Two problematic code points in Unicode casefolding of EXACT nodes:
1932
1933 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1934 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1935
1936 which casefold to
1937
1938 Unicode UTF-8
1939
1940 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1941 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1942
1943 This means that in case-insensitive matching (or "loose matching",
1944 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1945 length of the above casefolded versions) can match a target string
1946 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1947 This would rather mess up the minimum length computation.
1948
1949 What we'll do is to look for the tail four bytes, and then peek
1950 at the preceding two bytes to see whether we need to decrease
1951 the minimum length by four (six minus two).
1952
1953 Thanks to the design of UTF-8, there cannot be false matches:
1954 A sequence of valid UTF-8 bytes cannot be a subsequence of
1955 another valid sequence of UTF-8 bytes.
1956
1957 */
1958 char * const s0 = STRING(scan), *s, *t;
1959 char * const s1 = s0 + STR_LEN(scan) - 1;
1960 char * const s2 = s1 - 4;
1961 const char t0[] = "\xcc\x88\xcc\x81";
1962 const char * const t1 = t0 + 3;
1963
1964 for (s = s0 + 2;
1965 s < s2 && (t = ninstr(s, s1, t0, t1));
1966 s = t + 4) {
1967 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1968 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1969 *min -= 4;
1970 }
1971 }
1972
1973#ifdef DEBUGGING
1974 /* Allow dumping */
1975 n = scan + NODE_SZ_STR(scan);
1976 while (n <= stop) {
1977 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1978 OP(n) = OPTIMIZED;
1979 NEXT_OFF(n) = 0;
1980 }
1981 n++;
1982 }
1983#endif
1984 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1985 return stopnow;
1986}
1987
653099ff
GS
1988/* REx optimizer. Converts nodes into quickier variants "in place".
1989 Finds fixed substrings. */
1990
a0288114 1991/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1992 to the position after last scanned or to NULL. */
1993
07be1b83
YO
1994
1995
76e3520e 1996STATIC I32
9a957fbc
AL
1997S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1998 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1999 /* scanp: Start here (read-write). */
2000 /* deltap: Write maxlen-minlen here. */
2001 /* last: Stop before this one. */
2002{
97aff369 2003 dVAR;
c277df42
IZ
2004 I32 min = 0, pars = 0, code;
2005 regnode *scan = *scanp, *next;
2006 I32 delta = 0;
2007 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2008 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2009 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2010 scan_data_t data_fake;
653099ff 2011 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
2012 SV *re_trie_maxbuff = NULL;
2013
2014 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 2015
c277df42
IZ
2016 while (scan && OP(scan) != END && scan < last) {
2017 /* Peephole optimizer: */
07be1b83 2018 DEBUG_PEEP("Peep",scan,depth);
a3621e74 2019
07be1b83 2020 JOIN_EXACT(scan,&min,0);
a3621e74 2021
653099ff
GS
2022 /* Follow the next-chain of the current node and optimize
2023 away all the NOTHINGs from it. */
c277df42 2024 if (OP(scan) != CURLYX) {
a3b680e6 2025 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2026 ? I32_MAX
2027 /* I32 may be smaller than U16 on CRAYs! */
2028 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2029 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2030 int noff;
2031 regnode *n = scan;
b81d288d 2032
c277df42
IZ
2033 /* Skip NOTHING and LONGJMP. */
2034 while ((n = regnext(n))
3dab1dad 2035 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2036 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2037 && off + noff < max)
2038 off += noff;
2039 if (reg_off_by_arg[OP(scan)])
2040 ARG(scan) = off;
b81d288d 2041 else
c277df42
IZ
2042 NEXT_OFF(scan) = off;
2043 }
a3621e74 2044
07be1b83 2045
3dab1dad 2046
653099ff
GS
2047 /* The principal pseudo-switch. Cannot be a switch, since we
2048 look into several different things. */
b81d288d 2049 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2050 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2051 next = regnext(scan);
2052 code = OP(scan);
a3621e74 2053 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2054
2055 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 2056 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2057 struct regnode_charclass_class accum;
d4c19fe8 2058 regnode * const startbranch=scan;
c277df42 2059
653099ff 2060 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 2061 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 2062 if (flags & SCF_DO_STCLASS)
830247a4 2063 cl_init_zero(pRExC_state, &accum);
a3621e74 2064
c277df42 2065 while (OP(scan) == code) {
830247a4 2066 I32 deltanext, minnext, f = 0, fake;
653099ff 2067 struct regnode_charclass_class this_class;
c277df42
IZ
2068
2069 num++;
2070 data_fake.flags = 0;
b81d288d 2071 if (data) {
2c2d71f5 2072 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2073 data_fake.last_closep = data->last_closep;
2074 }
2075 else
2076 data_fake.last_closep = &fake;
c277df42
IZ
2077 next = regnext(scan);
2078 scan = NEXTOPER(scan);
2079 if (code != BRANCH)
2080 scan = NEXTOPER(scan);
653099ff 2081 if (flags & SCF_DO_STCLASS) {
830247a4 2082 cl_init(pRExC_state, &this_class);
653099ff
GS
2083 data_fake.start_class = &this_class;
2084 f = SCF_DO_STCLASS_AND;
b81d288d 2085 }
e1901655
IZ
2086 if (flags & SCF_WHILEM_VISITED_POS)
2087 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2088
653099ff 2089 /* we suppose the run is continuous, last=next...*/
830247a4 2090 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 2091 next, &data_fake, f,depth+1);
b81d288d 2092 if (min1 > minnext)
c277df42
IZ
2093 min1 = minnext;
2094 if (max1 < minnext + deltanext)
2095 max1 = minnext + deltanext;
2096 if (deltanext == I32_MAX)
aca2d497 2097 is_inf = is_inf_internal = 1;
c277df42
IZ
2098 scan = next;
2099 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2100 pars++;
3dab1dad
YO
2101 if (data) {
2102 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2103 data->flags |= SF_HAS_EVAL;
2c2d71f5 2104 data->whilem_c = data_fake.whilem_c;
3dab1dad 2105 }
653099ff 2106 if (flags & SCF_DO_STCLASS)
830247a4 2107 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2108 if (code == SUSPEND)
c277df42
IZ
2109 break;
2110 }
2111 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2112 min1 = 0;
2113 if (flags & SCF_DO_SUBSTR) {
2114 data->pos_min += min1;
2115 data->pos_delta += max1 - min1;
2116 if (max1 != min1 || is_inf)
2117 data->longest = &(data->longest_float);
2118 }
2119 min += min1;
2120 delta += max1 - min1;
653099ff 2121 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2122 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2123 if (min1) {
2124 cl_and(data->start_class, &and_with);
2125 flags &= ~SCF_DO_STCLASS;
2126 }
2127 }
2128 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2129 if (min1) {
2130 cl_and(data->start_class, &accum);
653099ff 2131 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2132 }
2133 else {
b81d288d 2134 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2135 * data->start_class */
2136 StructCopy(data->start_class, &and_with,
2137 struct regnode_charclass_class);
2138 flags &= ~SCF_DO_STCLASS_AND;
2139 StructCopy(&accum, data->start_class,
2140 struct regnode_charclass_class);
2141 flags |= SCF_DO_STCLASS_OR;
2142 data->start_class->flags |= ANYOF_EOS;
2143 }
653099ff 2144 }
a3621e74
YO
2145
2146 /* demq.
2147
2148 Assuming this was/is a branch we are dealing with: 'scan' now
2149 points at the item that follows the branch sequence, whatever
2150 it is. We now start at the beginning of the sequence and look
2151 for subsequences of
2152
2153 BRANCH->EXACT=>X
2154 BRANCH->EXACT=>X
2155
2156 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2157
2158 If we can find such a subseqence we need to turn the first
2159 element into a trie and then add the subsequent branch exact
2160 strings to the trie.
2161
2162 We have two cases
2163
2164 1. patterns where the whole set of branch can be converted to a trie,
2165
2166 2. patterns where only a subset of the alternations can be
2167 converted to a trie.
2168
2169 In case 1 we can replace the whole set with a single regop
2170 for the trie. In case 2 we need to keep the start and end
2171 branchs so
2172
2173 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2174 becomes BRANCH TRIE; BRANCH X;
2175
2176 Hypthetically when we know the regex isnt anchored we can
2177 turn a case 1 into a DFA and let it rip... Every time it finds a match
2178 it would just call its tail, no WHILEM/CURLY needed.
2179
2180 */
07be1b83 2181 if (PERL_ENABLE_TRIE_OPTIMISATION) {
3dab1dad 2182 int made=0;
0111c4fd
RGS
2183 if (!re_trie_maxbuff) {
2184 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2185 if (!SvIOK(re_trie_maxbuff))
2186 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2187 }
a3621e74
YO
2188 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2189 regnode *cur;
2190 regnode *first = (regnode *)NULL;
2191 regnode *last = (regnode *)NULL;
2192 regnode *tail = scan;
2193 U8 optype = 0;
2194 U32 count=0;
2195
2196#ifdef DEBUGGING
c445ea15 2197 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2198#endif
2199 /* var tail is used because there may be a TAIL
2200 regop in the way. Ie, the exacts will point to the
2201 thing following the TAIL, but the last branch will
2202 point at the TAIL. So we advance tail. If we
2203 have nested (?:) we may have to move through several
2204 tails.
2205 */
2206
2207 while ( OP( tail ) == TAIL ) {
2208 /* this is the TAIL generated by (?:) */
2209 tail = regnext( tail );
2210 }
2211
3dab1dad 2212
a3621e74 2213 DEBUG_OPTIMISE_r({
32fc9b6a 2214 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2215 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2216 (int)depth * 2 + 2, "",
2217 "Looking for TRIE'able sequences. Tail node is: ",
2218 SvPV_nolen_const( mysv )
a3621e74
YO
2219 );
2220 });
3dab1dad 2221
a3621e74
YO
2222 /*
2223
2224 step through the branches, cur represents each
2225 branch, noper is the first thing to be matched
2226 as part of that branch and noper_next is the
2227 regnext() of that node. if noper is an EXACT
2228 and noper_next is the same as scan (our current
2229 position in the regex) then the EXACT branch is
2230 a possible optimization target. Once we have
2231 two or more consequetive such branches we can
2232 create a trie of the EXACT's contents and stich
2233 it in place. If the sequence represents all of
2234 the branches we eliminate the whole thing and
2235 replace it with a single TRIE. If it is a
2236 subsequence then we need to stitch it in. This
2237 means the first branch has to remain, and needs
2238 to be repointed at the item on the branch chain
2239 following the last branch optimized. This could
2240 be either a BRANCH, in which case the
2241 subsequence is internal, or it could be the
2242 item following the branch sequence in which
2243 case the subsequence is at the end.
2244
2245 */
2246
2247 /* dont use tail as the end marker for this traverse */
2248 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2249 regnode * const noper = NEXTOPER( cur );
2250 regnode * const noper_next = regnext( noper );
a3621e74 2251
a3621e74 2252 DEBUG_OPTIMISE_r({
32fc9b6a 2253 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2254 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2255 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2256
32fc9b6a 2257 regprop(RExC_rx, mysv, noper);
a3621e74 2258 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2259 SvPV_nolen_const(mysv));
a3621e74
YO
2260
2261 if ( noper_next ) {
32fc9b6a 2262 regprop(RExC_rx, mysv, noper_next );
a3621e74 2263 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2264 SvPV_nolen_const(mysv));
a3621e74 2265 }
3dab1dad
YO
2266 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2267 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2268 });
3dab1dad
YO
2269 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2270 : PL_regkind[ OP( noper ) ] == EXACT )
2271 || OP(noper) == NOTHING )
a3621e74
YO
2272 && noper_next == tail && count<U16_MAX)
2273 {
2274 count++;
3dab1dad
YO
2275 if ( !first || optype == NOTHING ) {
2276 if (!first) first = cur;
a3621e74
YO
2277 optype = OP( noper );
2278 } else {
a3621e74 2279 last = cur;
a3621e74
YO
2280 }
2281 } else {
2282 if ( last ) {
3dab1dad 2283 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
a3621e74 2284 }
3dab1dad 2285 if ( PL_regkind[ OP( noper ) ] == EXACT
a3621e74
YO
2286 && noper_next == tail )
2287 {
2288 count = 1;
2289 first = cur;
2290 optype = OP( noper );
2291 } else {
2292 count = 0;
2293 first = NULL;
2294 optype = 0;
2295 }
2296 last = NULL;
2297 }
2298 }
2299 DEBUG_OPTIMISE_r({
32fc9b6a 2300 regprop(RExC_rx, mysv, cur);
a3621e74 2301 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2302 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2303 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2304
2305 });
2306 if ( last ) {
3dab1dad
YO
2307 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2308#ifdef TRIE_STUDY_OPT
07be1b83
YO
2309 if ( made && startbranch == first ) {
2310 if ( OP(first)!=TRIE )
2311 flags |= SCF_EXACT_TRIE;
2312 else {
2313 regnode *chk=*scanp;
2314 while ( OP( chk ) == OPEN )
2315 chk = regnext( chk );
2316 if (chk==first)
2317 flags |= SCF_EXACT_TRIE;
2318 }
2319 }
3dab1dad 2320#endif
07be1b83 2321 }
a3621e74 2322 }
3dab1dad
YO
2323
2324 } /* do trie */
a0ed51b3 2325 }
a3621e74 2326 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2327 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2328 } else /* single branch is optimized. */
c277df42
IZ
2329 scan = NEXTOPER(scan);
2330 continue;
a0ed51b3
LW
2331 }
2332 else if (OP(scan) == EXACT) {
cd439c50 2333 I32 l = STR_LEN(scan);
c445ea15 2334 UV uc;
a0ed51b3 2335 if (UTF) {
a3b680e6 2336 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2337 l = utf8_length(s, s + l);
9041c2e3 2338 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2339 } else {
2340 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2341 }
2342 min += l;
c277df42 2343 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2344 /* The code below prefers earlier match for fixed
2345 offset, later match for variable offset. */
2346 if (data->last_end == -1) { /* Update the start info. */
2347 data->last_start_min = data->pos_min;
2348 data->last_start_max = is_inf
b81d288d 2349 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2350 }
cd439c50 2351 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2352 if (UTF)
2353 SvUTF8_on(data->last_found);
0eda9292 2354 {
9a957fbc 2355 SV * const sv = data->last_found;
a28509cc 2356 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2357 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2358 if (mg && mg->mg_len >= 0)
5e43f467
JH
2359 mg->mg_len += utf8_length((U8*)STRING(scan),
2360 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2361 }
c277df42
IZ
2362 data->last_end = data->pos_min + l;
2363 data->pos_min += l; /* As in the first entry. */
2364 data->flags &= ~SF_BEFORE_EOL;
2365 }
653099ff
GS
2366 if (flags & SCF_DO_STCLASS_AND) {
2367 /* Check whether it is compatible with what we know already! */
2368 int compat = 1;
2369
1aa99e6b 2370 if (uc >= 0x100 ||
516a5887 2371 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2372 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2373 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2374 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2375 )
653099ff
GS
2376 compat = 0;
2377 ANYOF_CLASS_ZERO(data->start_class);
2378 ANYOF_BITMAP_ZERO(data->start_class);
2379 if (compat)
1aa99e6b 2380 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2381 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2382 if (uc < 0x100)
2383 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2384 }
2385 else if (flags & SCF_DO_STCLASS_OR) {
2386 /* false positive possible if the class is case-folded */
1aa99e6b 2387 if (uc < 0x100)
9b877dbb
IH
2388 ANYOF_BITMAP_SET(data->start_class, uc);
2389 else
2390 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2391 data->start_class->flags &= ~ANYOF_EOS;
2392 cl_and(data->start_class, &and_with);
2393 }
2394 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2395 }
3dab1dad 2396 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2397 I32 l = STR_LEN(scan);
1aa99e6b 2398 UV uc = *((U8*)STRING(scan));
653099ff
GS
2399
2400 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2401 if (flags & SCF_DO_SUBSTR) {
2402 assert(data);
830247a4 2403 scan_commit(pRExC_state, data);
ecaa9b9c 2404 }
a0ed51b3 2405 if (UTF) {
6136c704 2406 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2407 l = utf8_length(s, s + l);
9041c2e3 2408 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2409 }
2410 min += l;
ecaa9b9c 2411 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2412 data->pos_min += l;
653099ff
GS
2413 if (flags & SCF_DO_STCLASS_AND) {
2414 /* Check whether it is compatible with what we know already! */
2415 int compat = 1;
2416
1aa99e6b 2417 if (uc >= 0x100 ||
516a5887 2418 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2419 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2420 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2421 compat = 0;
2422 ANYOF_CLASS_ZERO(data->start_class);
2423 ANYOF_BITMAP_ZERO(data->start_class);
2424 if (compat) {
1aa99e6b 2425 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2426 data->start_class->flags &= ~ANYOF_EOS;
2427 data->start_class->flags |= ANYOF_FOLD;
2428 if (OP(scan) == EXACTFL)
2429 data->start_class->flags |= ANYOF_LOCALE;
2430 }
2431 }
2432 else if (flags & SCF_DO_STCLASS_OR) {
2433 if (data->start_class->flags & ANYOF_FOLD) {
2434 /* false positive possible if the class is case-folded.
2435 Assume that the locale settings are the same... */
1aa99e6b
IH
2436 if (uc < 0x100)
2437 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2438 data->start_class->flags &= ~ANYOF_EOS;
2439 }
2440 cl_and(data->start_class, &and_with);
2441 }
2442 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2443 }
3dab1dad
YO
2444#ifdef TRIE_STUDY_OPT
2445 else if (OP(scan) == TRIE) {
2446 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2447 min += trie->minlen;
07be1b83 2448 delta += (trie->maxlen - trie->minlen);
3dab1dad
YO
2449 flags &= ~SCF_DO_STCLASS; /* xxx */
2450 if (flags & SCF_DO_SUBSTR) {
2451 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2452 data->pos_min += trie->minlen;
07be1b83
YO
2453 data->pos_delta += (trie->maxlen - trie->minlen);
2454 if (trie->maxlen != trie->minlen)
2455 data->longest = &(data->longest_float);
3dab1dad
YO
2456 }
2457 }
2458#endif
bfed75c6 2459 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2460 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2461 I32 f = flags, pos_before = 0;
d4c19fe8 2462 regnode * const oscan = scan;
653099ff
GS
2463 struct regnode_charclass_class this_class;
2464 struct regnode_charclass_class *oclass = NULL;
727f22e3 2465 I32 next_is_eval = 0;
653099ff 2466
3dab1dad 2467 switch (PL_regkind[OP(scan)]) {
653099ff 2468 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2469 scan = NEXTOPER(scan);
2470 goto finish;
2471 case PLUS:
653099ff 2472 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2473 next = NEXTOPER(scan);
653099ff 2474 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2475 mincount = 1;
2476 maxcount = REG_INFTY;
c277df42
IZ
2477 next = regnext(scan);
2478 scan = NEXTOPER(scan);
2479 goto do_curly;
2480 }
2481 }
2482 if (flags & SCF_DO_SUBSTR)
2483 data->pos_min++;
2484 min++;
2485 /* Fall through. */
2486 case STAR:
653099ff
GS
2487 if (flags & SCF_DO_STCLASS) {
2488 mincount = 0;
b81d288d 2489 maxcount = REG_INFTY;
653099ff
GS
2490 next = regnext(scan);
2491 scan = NEXTOPER(scan);
2492 goto do_curly;
2493 }
b81d288d 2494 is_inf = is_inf_internal = 1;
c277df42
IZ
2495 scan = regnext(scan);
2496 if (flags & SCF_DO_SUBSTR) {
830247a4 2497 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2498 data->longest = &(data->longest_float);
2499 }
2500 goto optimize_curly_tail;
2501 case CURLY:
b81d288d 2502 mincount = ARG1(scan);
c277df42
IZ
2503 maxcount = ARG2(scan);
2504 next = regnext(scan);
cb434fcc
IZ
2505 if (OP(scan) == CURLYX) {
2506 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2507 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2508 }
c277df42 2509 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2510 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2511 do_curly:
2512 if (flags & SCF_DO_SUBSTR) {
830247a4 2513 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2514 pos_before = data->pos_min;
2515 }
2516 if (data) {
2517 fl = data->flags;
2518 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2519 if (is_inf)
2520 data->flags |= SF_IS_INF;
2521 }
653099ff 2522 if (flags & SCF_DO_STCLASS) {
830247a4 2523 cl_init(pRExC_state, &this_class);
653099ff
GS
2524 oclass = data->start_class;
2525 data->start_class = &this_class;
2526 f |= SCF_DO_STCLASS_AND;
2527 f &= ~SCF_DO_STCLASS_OR;
2528 }
e1901655
IZ
2529 /* These are the cases when once a subexpression
2530 fails at a particular position, it cannot succeed
2531 even after backtracking at the enclosing scope.
b81d288d 2532
e1901655
IZ
2533 XXXX what if minimal match and we are at the
2534 initial run of {n,m}? */
2535 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2536 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2537
c277df42 2538 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2539 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2540 (mincount == 0
2541 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2542
2543 if (flags & SCF_DO_STCLASS)
2544 data->start_class = oclass;
2545 if (mincount == 0 || minnext == 0) {
2546 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2547 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2548 }
2549 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2550 /* Switch to OR mode: cache the old value of
653099ff
GS
2551 * data->start_class */
2552 StructCopy(data->start_class, &and_with,
2553 struct regnode_charclass_class);
2554 flags &= ~SCF_DO_STCLASS_AND;
2555 StructCopy(&this_class, data->start_class,
2556 struct regnode_charclass_class);
2557 flags |= SCF_DO_STCLASS_OR;
2558 data->start_class->flags |= ANYOF_EOS;
2559 }
2560 } else { /* Non-zero len */
2561 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2562 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2563 cl_and(data->start_class, &and_with);
2564 }
2565 else if (flags & SCF_DO_STCLASS_AND)
2566 cl_and(data->start_class, &this_class);
2567 flags &= ~SCF_DO_STCLASS;
2568 }
c277df42
IZ
2569 if (!scan) /* It was not CURLYX, but CURLY. */
2570 scan = next;
041457d9
DM
2571 if ( /* ? quantifier ok, except for (?{ ... }) */
2572 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2573 && (minnext == 0) && (deltanext == 0)
99799961 2574 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2575 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2576 && ckWARN(WARN_REGEXP))
b45f050a 2577 {
830247a4 2578 vWARN(RExC_parse,
b45f050a
JF
2579 "Quantifier unexpected on zero-length expression");
2580 }
2581
c277df42 2582 min += minnext * mincount;
b81d288d 2583 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2584 && (minnext + deltanext) > 0)
2585 || deltanext == I32_MAX);
aca2d497 2586 is_inf |= is_inf_internal;
c277df42
IZ
2587 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2588
2589 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2590 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2591 && data->flags & SF_IN_PAR
2592 && !(data->flags & SF_HAS_EVAL)
2593 && !deltanext && minnext == 1 ) {
2594 /* Try to optimize to CURLYN. */
2595 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2596 regnode * const nxt1 = nxt;
497b47a8
JH
2597#ifdef DEBUGGING
2598 regnode *nxt2;
2599#endif
c277df42
IZ
2600
2601 /* Skip open. */
2602 nxt = regnext(nxt);
bfed75c6 2603 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2604 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2605 && STR_LEN(nxt) == 1))
c277df42 2606 goto nogo;
497b47a8 2607#ifdef DEBUGGING
c277df42 2608 nxt2 = nxt;
497b47a8 2609#endif
c277df42 2610 nxt = regnext(nxt);
b81d288d 2611 if (OP(nxt) != CLOSE)
c277df42
IZ
2612 goto nogo;
2613 /* Now we know that nxt2 is the only contents: */
eb160463 2614 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2615 OP(oscan) = CURLYN;
2616 OP(nxt1) = NOTHING; /* was OPEN. */
2617#ifdef DEBUGGING
2618 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2619 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2620 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2621 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2622 OP(nxt + 1) = OPTIMIZED; /* was count. */
2623 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2624#endif
c277df42 2625 }
c277df42
IZ
2626 nogo:
2627
2628 /* Try optimization CURLYX => CURLYM. */
b81d288d 2629 if ( OP(oscan) == CURLYX && data
c277df42 2630 && !(data->flags & SF_HAS_PAR)
c277df42 2631 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2632 && !deltanext /* atom is fixed width */
2633 && minnext != 0 /* CURLYM can't handle zero width */
2634 ) {
c277df42
IZ
2635 /* XXXX How to optimize if data == 0? */
2636 /* Optimize to a simpler form. */
2637 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2638 regnode *nxt2;
2639
2640 OP(oscan) = CURLYM;
2641 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2642 && (OP(nxt2) != WHILEM))
c277df42
IZ
2643 nxt = nxt2;
2644 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2645 /* Need to optimize away parenths. */
2646 if (data->flags & SF_IN_PAR) {
2647 /* Set the parenth number. */
2648 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2649
b81d288d 2650 if (OP(nxt) != CLOSE)
b45f050a 2651 FAIL("Panic opt close");
eb160463 2652 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2653 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2654 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2655#ifdef DEBUGGING
2656 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2657 OP(nxt + 1) = OPTIMIZED; /* was count. */
2658 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2659 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2660#endif
c277df42
IZ
2661#if 0
2662 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2663 regnode *nnxt = regnext(nxt1);
b81d288d 2664
c277df42
IZ
2665 if (nnxt == nxt) {
2666 if (reg_off_by_arg[OP(nxt1)])
2667 ARG_SET(nxt1, nxt2 - nxt1);
2668 else if (nxt2 - nxt1 < U16_MAX)
2669 NEXT_OFF(nxt1) = nxt2 - nxt1;
2670 else
2671 OP(nxt) = NOTHING; /* Cannot beautify */
2672 }
2673 nxt1 = nnxt;
2674 }
2675#endif
2676 /* Optimize again: */
b81d288d 2677 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2678 NULL, 0,depth+1);
a0ed51b3
LW
2679 }
2680 else
c277df42 2681 oscan->flags = 0;
c277df42 2682 }
e1901655
IZ
2683 else if ((OP(oscan) == CURLYX)
2684 && (flags & SCF_WHILEM_VISITED_POS)
2685 /* See the comment on a similar expression above.
2686 However, this time it not a subexpression
2687 we care about, but the expression itself. */
2688 && (maxcount == REG_INFTY)
2689 && data && ++data->whilem_c < 16) {
2690 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2691 /* Find WHILEM (as in regexec.c) */
2692 regnode *nxt = oscan + NEXT_OFF(oscan);
2693
2694 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2695 nxt += ARG(nxt);
eb160463
GS
2696 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2697 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2698 }
b81d288d 2699 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2700 pars++;
2701 if (flags & SCF_DO_SUBSTR) {
c445ea15 2702 SV *last_str = NULL;
c277df42
IZ
2703 int counted = mincount != 0;
2704
2705 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2706#if defined(SPARC64_GCC_WORKAROUND)
2707 I32 b = 0;
2708 STRLEN l = 0;
cfd0369c 2709 const char *s = NULL;
5d1c421c
JH
2710 I32 old = 0;
2711
2712 if (pos_before >= data->last_start_min)
2713 b = pos_before;
2714 else
2715 b = data->last_start_min;
2716
2717 l = 0;
cfd0369c 2718 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2719 old = b - data->last_start_min;
2720
2721#else
b81d288d 2722 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2723 ? pos_before : data->last_start_min;
2724 STRLEN l;
d4c19fe8 2725 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2726 I32 old = b - data->last_start_min;
5d1c421c 2727#endif
a0ed51b3
LW
2728
2729 if (UTF)
2730 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2731
a0ed51b3 2732 l -= old;
c277df42 2733 /* Get the added string: */
79cb57f6 2734 last_str = newSVpvn(s + old, l);
0e933229
IH
2735 if (UTF)
2736 SvUTF8_on(last_str);
c277df42
IZ
2737 if (deltanext == 0 && pos_before == b) {
2738 /* What was added is a constant string */
2739 if (mincount > 1) {
2740 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2741 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2742 SvPVX_const(last_str), l, mincount - 1);
b162af07 2743 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2744 /* Add additional parts. */
b81d288d 2745 SvCUR_set(data->last_found,
c277df42
IZ
2746 SvCUR(data->last_found) - l);
2747 sv_catsv(data->last_found, last_str);
0eda9292
JH
2748 {
2749 SV * sv = data->last_found;
2750 MAGIC *mg =
2751 SvUTF8(sv) && SvMAGICAL(sv) ?
2752 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2753 if (mg && mg->mg_len >= 0)
2754 mg->mg_len += CHR_SVLEN(last_str);
2755 }
c277df42
IZ
2756 data->last_end += l * (mincount - 1);
2757 }
2a8d9689
HS
2758 } else {
2759 /* start offset must point into the last copy */
2760 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2761 data->last_start_max += is_inf ? I32_MAX
2762 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2763 }
2764 }
2765 /* It is counted once already... */
2766 data->pos_min += minnext * (mincount - counted);
2767 data->pos_delta += - counted * deltanext +
2768 (minnext + deltanext) * maxcount - minnext * mincount;
2769 if (mincount != maxcount) {
653099ff
GS
2770 /* Cannot extend fixed substrings found inside
2771 the group. */
830247a4 2772 scan_commit(pRExC_state,data);
c277df42 2773 if (mincount && last_str) {
d4c19fe8
AL
2774 SV * const sv = data->last_found;
2775 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2776 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2777
2778 if (mg)
2779 mg->mg_len = -1;
2780 sv_setsv(sv, last_str);
c277df42 2781 data->last_end = data->pos_min;
b81d288d 2782 data->last_start_min =
a0ed51b3 2783 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2784 data->last_start_max = is_inf
2785 ? I32_MAX
c277df42 2786 : data->pos_min + data->pos_delta
a0ed51b3 2787 - CHR_SVLEN(last_str);
c277df42
IZ
2788 }
2789 data->longest = &(data->longest_float);
2790 }
aca2d497 2791 SvREFCNT_dec(last_str);
c277df42 2792 }
405ff068 2793 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2794 data->flags |= SF_HAS_EVAL;
2795 optimize_curly_tail:
c277df42 2796 if (OP(oscan) != CURLYX) {
3dab1dad 2797 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2798 && NEXT_OFF(next))
2799 NEXT_OFF(oscan) += NEXT_OFF(next);
2800 }
c277df42 2801 continue;
653099ff 2802 default: /* REF and CLUMP only? */
c277df42 2803 if (flags & SCF_DO_SUBSTR) {
830247a4 2804 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2805 data->longest = &(data->longest_float);
2806 }
aca2d497 2807 is_inf = is_inf_internal = 1;
653099ff 2808 if (flags & SCF_DO_STCLASS_OR)
830247a4 2809 cl_anything(pRExC_state, data->start_class);
653099ff 2810 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2811 break;
2812 }
a0ed51b3 2813 }
bfed75c6 2814 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2815 int value = 0;
653099ff 2816
c277df42 2817 if (flags & SCF_DO_SUBSTR) {
830247a4 2818 scan_commit(pRExC_state,data);
c277df42
IZ
2819 data->pos_min++;
2820 }
2821 min++;
653099ff
GS
2822 if (flags & SCF_DO_STCLASS) {
2823 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2824
2825 /* Some of the logic below assumes that switching
2826 locale on will only add false positives. */
3dab1dad 2827 switch (PL_regkind[OP(scan)]) {
653099ff 2828 case SANY:
653099ff
GS
2829 default:
2830 do_default:
2831 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2832 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2833 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2834 break;
2835 case REG_ANY:
2836 if (OP(scan) == SANY)
2837 goto do_default;
2838 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2839 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2840 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2841 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2842 }
2843 if (flags & SCF_DO_STCLASS_AND || !value)
2844 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2845 break;
2846 case ANYOF:
2847 if (flags & SCF_DO_STCLASS_AND)
2848 cl_and(data->start_class,
2849 (struct regnode_charclass_class*)scan);
2850 else
830247a4 2851 cl_or(pRExC_state, data->start_class,
653099ff
GS
2852 (struct regnode_charclass_class*)scan);
2853 break;
2854 case ALNUM:
2855 if (flags & SCF_DO_STCLASS_AND) {
2856 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2857 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2858 for (value = 0; value < 256; value++)
2859 if (!isALNUM(value))
2860 ANYOF_BITMAP_CLEAR(data->start_class, value);
2861 }
2862 }
2863 else {
2864 if (data->start_class->flags & ANYOF_LOCALE)
2865 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2866 else {
2867 for (value = 0; value < 256; value++)
2868 if (isALNUM(value))
b81d288d 2869 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2870 }
2871 }
2872 break;
2873 case ALNUML:
2874 if (flags & SCF_DO_STCLASS_AND) {
2875 if (data->start_class->flags & ANYOF_LOCALE)
2876 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2877 }
2878 else {
2879 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2880 data->start_class->flags |= ANYOF_LOCALE;
2881 }
2882 break;
2883 case NALNUM:
2884 if (flags & SCF_DO_STCLASS_AND) {
2885 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2886 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2887 for (value = 0; value < 256; value++)
2888 if (isALNUM(value))
2889 ANYOF_BITMAP_CLEAR(data->start_class, value);
2890 }
2891 }
2892 else {
2893 if (data->start_class->flags & ANYOF_LOCALE)
2894 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2895 else {
2896 for (value = 0; value < 256; value++)
2897 if (!isALNUM(value))
b81d288d 2898 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2899 }
2900 }
2901 break;
2902 case NALNUML:
2903 if (flags & SCF_DO_STCLASS_AND) {
2904 if (data->start_class->flags & ANYOF_LOCALE)
2905 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2906 }
2907 else {
2908 data->start_class->flags |= ANYOF_LOCALE;
2909 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2910 }
2911 break;
2912 case SPACE:
2913 if (flags & SCF_DO_STCLASS_AND) {
2914 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2915 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2916 for (value = 0; value < 256; value++)
2917 if (!isSPACE(value))
2918 ANYOF_BITMAP_CLEAR(data->start_class, value);
2919 }
2920 }
2921 else {
2922 if (data->start_class->flags & ANYOF_LOCALE)
2923 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2924 else {
2925 for (value = 0; value < 256; value++)
2926 if (isSPACE(value))
b81d288d 2927 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2928 }
2929 }
2930 break;
2931 case SPACEL:
2932 if (flags & SCF_DO_STCLASS_AND) {
2933 if (data->start_class->flags & ANYOF_LOCALE)
2934 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2935 }
2936 else {
2937 data->start_class->flags |= ANYOF_LOCALE;
2938 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2939 }
2940 break;
2941 case NSPACE:
2942 if (flags & SCF_DO_STCLASS_AND) {
2943 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2944 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2945 for (value = 0; value < 256; value++)
2946 if (isSPACE(value))
2947 ANYOF_BITMAP_CLEAR(data->start_class, value);
2948 }
2949 }
2950 else {
2951 if (data->start_class->flags & ANYOF_LOCALE)
2952 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2953 else {
2954 for (value = 0; value < 256; value++)
2955 if (!isSPACE(value))
b81d288d 2956 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2957 }
2958 }
2959 break;
2960 case NSPACEL:
2961 if (flags & SCF_DO_STCLASS_AND) {
2962 if (data->start_class->flags & ANYOF_LOCALE) {
2963 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2964 for (value = 0; value < 256; value++)
2965 if (!isSPACE(value))
2966 ANYOF_BITMAP_CLEAR(data->start_class, value);
2967 }
2968 }
2969 else {
2970 data->start_class->flags |= ANYOF_LOCALE;
2971 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2972 }
2973 break;
2974 case DIGIT:
2975 if (flags & SCF_DO_STCLASS_AND) {
2976 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2977 for (value = 0; value < 256; value++)
2978 if (!isDIGIT(value))
2979 ANYOF_BITMAP_CLEAR(data->start_class, value);
2980 }
2981 else {
2982 if (data->start_class->flags & ANYOF_LOCALE)
2983 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2984 else {
2985 for (value = 0; value < 256; value++)
2986 if (isDIGIT(value))
b81d288d 2987 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2988 }
2989 }
2990 break;
2991 case NDIGIT:
2992 if (flags & SCF_DO_STCLASS_AND) {
2993 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2994 for (value = 0; value < 256; value++)
2995 if (isDIGIT(value))
2996 ANYOF_BITMAP_CLEAR(data->start_class, value);
2997 }
2998 else {
2999 if (data->start_class->flags & ANYOF_LOCALE)
3000 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3001 else {
3002 for (value = 0; value < 256; value++)
3003 if (!isDIGIT(value))
b81d288d 3004 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3005 }
3006 }
3007 break;
3008 }
3009 if (flags & SCF_DO_STCLASS_OR)
3010 cl_and(data->start_class, &and_with);
3011 flags &= ~SCF_DO_STCLASS;
3012 }
a0ed51b3 3013 }
3dab1dad 3014 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3015 data->flags |= (OP(scan) == MEOL
3016 ? SF_BEFORE_MEOL
3017 : SF_BEFORE_SEOL);
a0ed51b3 3018 }
3dab1dad 3019 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3020 /* Lookbehind, or need to calculate parens/evals/stclass: */
3021 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3022 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 3023 /* Lookahead/lookbehind */
cb434fcc 3024 I32 deltanext, minnext, fake = 0;
c277df42 3025 regnode *nscan;
653099ff
GS
3026 struct regnode_charclass_class intrnl;
3027 int f = 0;
c277df42
IZ
3028
3029 data_fake.flags = 0;
b81d288d 3030 if (data) {
2c2d71f5 3031 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
3032 data_fake.last_closep = data->last_closep;
3033 }
3034 else
3035 data_fake.last_closep = &fake;
653099ff
GS
3036 if ( flags & SCF_DO_STCLASS && !scan->flags
3037 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 3038 cl_init(pRExC_state, &intrnl);
653099ff 3039 data_fake.start_class = &intrnl;
e1901655 3040 f |= SCF_DO_STCLASS_AND;
653099ff 3041 }
e1901655
IZ
3042 if (flags & SCF_WHILEM_VISITED_POS)
3043 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
3044 next = regnext(scan);
3045 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 3046 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
3047 if (scan->flags) {
3048 if (deltanext) {
9baa0206 3049 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
3050 }
3051 else if (minnext > U8_MAX) {
9baa0206 3052 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 3053 }
eb160463 3054 scan->flags = (U8)minnext;
c277df42
IZ
3055 }
3056 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3057 pars++;
405ff068 3058 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 3059 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
3060 if (data)
3061 data->whilem_c = data_fake.whilem_c;
e1901655 3062 if (f & SCF_DO_STCLASS_AND) {
a28509cc 3063 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
3064
3065 cl_and(data->start_class, &intrnl);
3066 if (was)
3067 data->start_class->flags |= ANYOF_EOS;
3068 }
a0ed51b3
LW
3069 }
3070 else if (OP(scan) == OPEN) {
c277df42 3071 pars++;
a0ed51b3 3072 }
cb434fcc 3073 else if (OP(scan) == CLOSE) {
eb160463 3074 if ((I32)ARG(scan) == is_par) {
cb434fcc 3075 next = regnext(scan);
c277df42 3076
cb434fcc
IZ
3077 if ( next && (OP(next) != WHILEM) && next < last)
3078 is_par = 0; /* Disable optimization */
3079 }
3080 if (data)
3081 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3082 }
3083 else if (OP(scan) == EVAL) {
c277df42
IZ
3084 if (data)
3085 data->flags |= SF_HAS_EVAL;
3086 }
96776eda 3087 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3088 if (flags & SCF_DO_SUBSTR) {
830247a4 3089 scan_commit(pRExC_state,data);
0f5d15d6
IZ
3090 data->longest = &(data->longest_float);
3091 }
3092 is_inf = is_inf_internal = 1;
653099ff 3093 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3094 cl_anything(pRExC_state, data->start_class);
96776eda 3095 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3096 }
c277df42
IZ
3097 /* Else: zero-length, ignore. */
3098 scan = regnext(scan);
3099 }
3100
3101 finish:
3102 *scanp = scan;
aca2d497 3103 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3104 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
3105 data->pos_delta = I32_MAX - data->pos_min;
3106 if (is_par > U8_MAX)
3107 is_par = 0;
3108 if (is_par && pars==1 && data) {
3109 data->flags |= SF_IN_PAR;
3110 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3111 }
3112 else if (pars && data) {
c277df42
IZ
3113 data->flags |= SF_HAS_PAR;
3114 data->flags &= ~SF_IN_PAR;
3115 }
653099ff
GS
3116 if (flags & SCF_DO_STCLASS_OR)
3117 cl_and(data->start_class, &and_with);
07be1b83
YO
3118 if (flags & SCF_EXACT_TRIE)
3119 data->flags |= SCF_EXACT_TRIE;
c277df42
IZ
3120 return min;
3121}
3122
76e3520e 3123STATIC I32
5f66b61c 3124S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3125{
830247a4 3126 if (RExC_rx->data) {
b81d288d
AB
3127 Renewc(RExC_rx->data,
3128 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3129 char, struct reg_data);
830247a4
IZ
3130 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3131 RExC_rx->data->count += n;
a0ed51b3
LW
3132 }
3133 else {
a02a5408 3134 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3135 char, struct reg_data);
a02a5408 3136 Newx(RExC_rx->data->what, n, U8);
830247a4 3137 RExC_rx->data->count = n;
c277df42 3138 }
830247a4
IZ
3139 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3140 return RExC_rx->data->count - n;
c277df42
IZ
3141}
3142
76234dfb 3143#ifndef PERL_IN_XSUB_RE
d88dccdf 3144void
864dbfa3 3145Perl_reginitcolors(pTHX)
d88dccdf 3146{
97aff369 3147 dVAR;
1df70142 3148 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3149 if (s) {
1df70142
AL
3150 char *t = savepv(s);
3151 int i = 0;
3152 PL_colors[0] = t;
d88dccdf 3153 while (++i < 6) {
1df70142
AL
3154 t = strchr(t, '\t');
3155 if (t) {
3156 *t = '\0';
3157 PL_colors[i] = ++t;
d88dccdf
IZ
3158 }
3159 else
1df70142 3160 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3161 }
3162 } else {
1df70142 3163 int i = 0;
b81d288d 3164 while (i < 6)
06b5626a 3165 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3166 }
3167 PL_colorset = 1;
3168}
76234dfb 3169#endif
8615cb43 3170
07be1b83 3171
a687059c 3172/*
e50aee73 3173 - pregcomp - compile a regular expression into internal code
a687059c
LW
3174 *
3175 * We can't allocate space until we know how big the compiled form will be,
3176 * but we can't compile it (and thus know how big it is) until we've got a
3177 * place to put the code. So we cheat: we compile it twice, once with code
3178 * generation turned off and size counting turned on, and once "for real".
3179 * This also means that we don't allocate space until we are sure that the
3180 * thing really will compile successfully, and we never have to move the
3181 * code and thus invalidate pointers into it. (Note that it has to be in
3182 * one piece because free() must be able to free it all.) [NB: not true in perl]
3183 *
3184 * Beware that the optimization-preparation code in here knows about some
3185 * of the structure of the compiled regexp. [I'll say.]
3186 */
3187regexp *
864dbfa3 3188Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3189{
97aff369 3190 dVAR;
a0d0e21e 3191 register regexp *r;
c277df42 3192 regnode *scan;
c277df42 3193 regnode *first;
a0d0e21e 3194 I32 flags;
a0d0e21e
LW
3195 I32 minlen = 0;
3196 I32 sawplus = 0;
3197 I32 sawopen = 0;
2c2d71f5 3198 scan_data_t data;
830247a4
IZ
3199 RExC_state_t RExC_state;
3200 RExC_state_t *pRExC_state = &RExC_state;
07be1b83
YO
3201#ifdef TRIE_STUDY_OPT
3202 int restudied= 0;
3203 RExC_state_t copyRExC_state;
3204#endif
a0d0e21e 3205
a3621e74
YO
3206 GET_RE_DEBUG_FLAGS_DECL;
3207
a0d0e21e 3208 if (exp == NULL)
c277df42 3209 FAIL("NULL regexp argument");
a0d0e21e 3210
a5961de5 3211 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3212
5cfc7842 3213 RExC_precomp = exp;
a3621e74
YO
3214 DEBUG_r(if (!PL_colorset) reginitcolors());
3215 DEBUG_COMPILE_r({
3216 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
3217 PL_colors[4],PL_colors[5],PL_colors[0],
3218 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3219 });
e2509266 3220 RExC_flags = pm->op_pmflags;
830247a4 3221 RExC_sawback = 0;
bbce6d69 3222
830247a4
IZ
3223 RExC_seen = 0;
3224 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3225 RExC_seen_evals = 0;
3226 RExC_extralen = 0;
c277df42 3227
bbce6d69 3228 /* First pass: determine size, legality. */
830247a4 3229 RExC_parse = exp;
fac92740 3230 RExC_start = exp;
830247a4
IZ
3231 RExC_end = xend;
3232 RExC_naughty = 0;
3233 RExC_npar = 1;
3234 RExC_size = 0L;
3235 RExC_emit = &PL_regdummy;
3236 RExC_whilem_seen = 0;
85ddcde9
JH
3237#if 0 /* REGC() is (currently) a NOP at the first pass.
3238 * Clever compilers notice this and complain. --jhi */
830247a4 3239 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3240#endif
3dab1dad
YO
3241 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3242 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3243 RExC_precomp = NULL;
a0d0e21e
LW
3244 return(NULL);
3245 }
3dab1dad
YO
3246 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3247 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3248 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3249 DEBUG_PARSE_r({
3250 RExC_lastnum=0;
3251 RExC_lastparse=NULL;
3252 });
c277df42 3253
07be1b83 3254
c277df42
IZ
3255 /* Small enough for pointer-storage convention?
3256 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3257 if (RExC_size >= 0x10000L && RExC_extralen)
3258 RExC_size += RExC_extralen;
c277df42 3259 else
830247a4
IZ
3260 RExC_extralen = 0;
3261 if (RExC_whilem_seen > 15)
3262 RExC_whilem_seen = 15;
a0d0e21e 3263
bbce6d69 3264 /* Allocate space and initialize. */
a02a5408 3265 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3266 char, regexp);
a0d0e21e 3267 if (r == NULL)
b45f050a
JF
3268 FAIL("Regexp out of space");
3269
0f79a09d
GS
3270#ifdef DEBUGGING
3271 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3272 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3273#endif
c277df42 3274 r->refcnt = 1;
bbce6d69 3275 r->prelen = xend - exp;
5cfc7842 3276 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3277 r->subbeg = NULL;
f8c7b90f 3278#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3279 r->saved_copy = NULL;
ed252734 3280#endif
cf93c79d 3281 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3282 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3283 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3284
3285 r->substrs = 0; /* Useful during FAIL. */
3286 r->startp = 0; /* Useful during FAIL. */
3287 r->endp = 0; /* Useful during FAIL. */
3288
a02a5408 3289 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3290 if (r->offsets) {
2af232bd 3291 r->offsets[0] = RExC_size;
fac92740 3292 }
a3621e74 3293 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3294 "%s %"UVuf" bytes for offset annotations.\n",
3295 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3296 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3297
830247a4 3298 RExC_rx = r;
bbce6d69 3299
3300 /* Second pass: emit code. */
e2509266 3301 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3302 RExC_parse = exp;
3303 RExC_end = xend;
3304 RExC_naughty = 0;
3305 RExC_npar = 1;
fac92740 3306 RExC_emit_start = r->program;
830247a4 3307 RExC_emit = r->program;
2cd61cdb 3308 /* Store the count of eval-groups for security checks: */
eb160463 3309 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 3310 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3311 r->data = 0;
3dab1dad 3312 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3313 return(NULL);
07be1b83
YO
3314 /* XXXX To minimize changes to RE engine we always allocate
3315 3-units-long substrs field. */
3316 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3317
07be1b83
YO
3318reStudy:
3319 Zero(r->substrs, 1, struct reg_substr_data);
3320 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3321
07be1b83
YO
3322#ifdef TRIE_STUDY_OPT
3323 if ( restudied ) {
3324 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3325 RExC_state=copyRExC_state;
3326 if (data.longest_fixed)
3327 SvREFCNT_dec(data.longest_fixed);
3328 if (data.longest_float)
3329 SvREFCNT_dec(data.longest_float);
3330 if (data.last_found)
3331 SvREFCNT_dec(data.last_found);
3332 } else {
3333 copyRExC_state=RExC_state;
3334 }
3335#endif
a0d0e21e 3336 /* Dig out information for optimizations. */
cf93c79d 3337 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3338 pm->op_pmflags = RExC_flags;
a0ed51b3 3339 if (UTF)
5ff6fc6d 3340 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3341 r->regstclass = NULL;
830247a4 3342 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3343 r->reganch |= ROPT_NAUGHTY;
c277df42 3344 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3345
653099ff 3346 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3347 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3348 I32 fake;
c5254dd6 3349 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3350 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3351 int stclass_flag;
07be1b83 3352 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3353
3354 first = scan;
c277df42 3355 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3356 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3357 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3358 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3359 /* for now we can't handle lookbehind IFMATCH*/
3360 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3361 (OP(first) == PLUS) ||
3362 (OP(first) == MINMOD) ||
653099ff 3363 /* An {n,m} with n>0 */
07be1b83
YO
3364 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3365 {
3366 DEBUG_PEEP("first:",first,0);
a0d0e21e
LW
3367 if (OP(first) == PLUS)
3368 sawplus = 1;
3369 else
3dab1dad 3370 first += regarglen[OP(first)];
07be1b83
YO
3371 if (OP(first) == IFMATCH) {
3372 first = NEXTOPER(first);
3373 first += EXTRA_STEP_2ARGS;
3374 } else /*xxx possible optimisation for /(?=)/*/
3375 first = NEXTOPER(first);
a687059c
LW
3376 }
3377
a0d0e21e
LW
3378 /* Starting-point info. */
3379 again:
07be1b83 3380 /* Ignore EXACT as we deal with it later. */
3dab1dad 3381 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3382 if (OP(first) == EXACT)
6f207bd3 3383 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3384 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3385 r->regstclass = first;
3386 }
07be1b83
YO
3387#ifdef TRIE_STCLASS
3388 else if (OP(first) == TRIE &&
3389 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3390 {
3391 /* this can happen only on restudy */
3392 struct regnode_1 *trie_op;
3393 Newxz(trie_op,1,struct regnode_1);
3394 StructCopy(first,trie_op,struct regnode_1);
3395 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3396 r->regstclass = (regnode *)trie_op;
3397 }
3398#endif
bfed75c6 3399 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3400 r->regstclass = first;
3dab1dad
YO
3401 else if (PL_regkind[OP(first)] == BOUND ||
3402 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3403 r->regstclass = first;
3dab1dad 3404 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3405 r->reganch |= (OP(first) == MBOL
3406 ? ROPT_ANCH_MBOL
3407 : (OP(first) == SBOL
3408 ? ROPT_ANCH_SBOL
3409 : ROPT_ANCH_BOL));
a0d0e21e 3410 first = NEXTOPER(first);
774d564b 3411 goto again;
3412 }
3413 else if (OP(first) == GPOS) {
3414 r->reganch |= ROPT_ANCH_GPOS;
3415 first = NEXTOPER(first);
3416 goto again;
a0d0e21e 3417 }
e09294f4 3418 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3419 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3420 !(r->reganch & ROPT_ANCH) )
3421 {
3422 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3423 const int type =
3424 (OP(NEXTOPER(first)) == REG_ANY)
3425 ? ROPT_ANCH_MBOL
3426 : ROPT_ANCH_SBOL;
cad2e5aa 3427 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3428 first = NEXTOPER(first);
774d564b 3429 goto again;
a0d0e21e 3430 }
b81d288d 3431 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3432 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3433 /* x+ must match at the 1st pos of run of x's */
3434 r->reganch |= ROPT_SKIP;
a0d0e21e 3435
c277df42 3436 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3437 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3438 (IV)(first - scan + 1)));
a0d0e21e
LW
3439 /*
3440 * If there's something expensive in the r.e., find the
3441 * longest literal string that must appear and make it the
3442 * regmust. Resolve ties in favor of later strings, since
3443 * the regstart check works with the beginning of the r.e.
3444 * and avoiding duplication strengthens checking. Not a
3445 * strong reason, but sufficient in the absence of others.
3446 * [Now we resolve ties in favor of the earlier string if
c277df42 3447 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3448 * earlier string may buy us something the later one won't.]
3449 */
a0d0e21e 3450 minlen = 0;
a687059c 3451
396482e1
GA
3452 data.longest_fixed = newSVpvs("");
3453 data.longest_float = newSVpvs("");
3454 data.last_found = newSVpvs("");
c277df42
IZ
3455 data.longest = &(data.longest_fixed);
3456 first = scan;
653099ff 3457 if (!r->regstclass) {
830247a4 3458 cl_init(pRExC_state, &ch_class);
653099ff
GS
3459 data.start_class = &ch_class;
3460 stclass_flag = SCF_DO_STCLASS_AND;
3461 } else /* XXXX Check for BOUND? */
3462 stclass_flag = 0;
cb434fcc 3463 data.last_closep = &last_close;
653099ff 3464
830247a4 3465 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3466 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83
YO
3467
3468#ifdef TRIE_STUDY_OPT
3469 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3470 goto reStudy;
3471 }
3472#endif
3473
830247a4 3474 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3475 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3476 && !RExC_seen_zerolen
3477 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3478 r->reganch |= ROPT_CHECK_ALL;
830247a4 3479 scan_commit(pRExC_state, &data);
c277df42
IZ
3480 SvREFCNT_dec(data.last_found);
3481
a0ed51b3 3482 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3483 if (longest_float_length
c277df42
IZ
3484 || (data.flags & SF_FL_BEFORE_EOL
3485 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3486 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3487 int t;
3488
a0ed51b3 3489 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3490 && data.offset_fixed == data.offset_float_min
3491 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3492 goto remove_float; /* As in (a)+. */
3493
33b8afdf
JH
3494 if (SvUTF8(data.longest_float)) {
3495 r->float_utf8 = data.longest_float;
c445ea15 3496 r->float_substr = NULL;
33b8afdf
JH
3497 } else {
3498 r->float_substr = data.longest_float;
c445ea15 3499 r->float_utf8 = NULL;
33b8afdf 3500 }
c277df42
IZ
3501 r->float_min_offset = data.offset_float_min;
3502 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3503 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3504 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3505 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3506 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3507 }
3508 else {
aca2d497 3509 remove_float:
c445ea15 3510 r->float_substr = r->float_utf8 = NULL;
c277df42 3511 SvREFCNT_dec(data.longest_float);
c5254dd6 3512 longest_float_length = 0;
a0d0e21e 3513 }
c277df42 3514
a0ed51b3 3515 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3516 if (longest_fixed_length
c277df42
IZ
3517 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3518 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3519 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3520 int t;
3521
33b8afdf
JH
3522 if (SvUTF8(data.longest_fixed)) {
3523 r->anchored_utf8 = data.longest_fixed;
c445ea15 3524 r->anchored_substr = NULL;
33b8afdf
JH
3525 } else {
3526 r->anchored_substr = data.longest_fixed;
c445ea15 3527 r->anchored_utf8 = NULL;
33b8afdf 3528 }
c277df42 3529 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3530 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3531 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3532 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3533 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3534 }
3535 else {
c445ea15 3536 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3537 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3538 longest_fixed_length = 0;
a0d0e21e 3539 }
b81d288d 3540 if (r->regstclass
ffc61ed2 3541 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3542 r->regstclass = NULL;
33b8afdf
JH
3543 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3544 && stclass_flag
653099ff 3545 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3546 && !cl_is_anything(data.start_class))
3547 {
1df70142 3548 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3549
a02a5408 3550 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3551 struct regnode_charclass_class);
3552 StructCopy(data.start_class,
830247a4 3553 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3554 struct regnode_charclass_class);
830247a4 3555 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3556 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3557 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3558 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3559 PerlIO_printf(Perl_debug_log,
a0288114 3560 "synthetic stclass \"%s\".\n",
3f7c398e 3561 SvPVX_const(sv));});
653099ff 3562 }
c277df42
IZ
3563
3564 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3565 if (longest_fixed_length > longest_float_length) {
c277df42 3566 r->check_substr = r->anchored_substr;
33b8afdf 3567 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3568 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3569 if (r->reganch & ROPT_ANCH_SINGLE)
3570 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3571 }
3572 else {
c277df42 3573 r->check_substr = r->float_substr;
33b8afdf 3574 r->check_utf8 = r->float_utf8;
c277df42
IZ
3575 r->check_offset_min = data.offset_float_min;
3576 r->check_offset_max = data.offset_float_max;
a0d0e21e 3577 }
30382c73
IZ
3578 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3579 This should be changed ASAP! */
33b8afdf 3580 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3581 r->reganch |= RE_USE_INTUIT;
33b8afdf 3582 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3583 r->reganch |= RE_INTUIT_TAIL;
3584 }
a0ed51b3
LW
3585 }
3586 else {
c277df42
IZ
3587 /* Several toplevels. Best we can is to set minlen. */
3588 I32 fake;
653099ff 3589 struct regnode_charclass_class ch_class;
cb434fcc 3590 I32 last_close = 0;
c277df42 3591
a3621e74 3592 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 3593
c277df42 3594 scan = r->program + 1;
830247a4 3595 cl_init(pRExC_state, &ch_class);
653099ff 3596 data.start_class = &ch_class;
cb434fcc 3597 data.last_closep = &last_close;
07be1b83
YO
3598
3599 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3600 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3601
3602#ifdef TRIE_STUDY_OPT
3603 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3604 goto reStudy;
3605 }
3606#endif
3607
33b8afdf 3608 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3609 = r->float_substr = r->float_utf8 = NULL;
653099ff 3610 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3611 && !cl_is_anything(data.start_class))
3612 {
1df70142 3613 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3614
a02a5408 3615 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3616 struct regnode_charclass_class);
3617 StructCopy(data.start_class,
830247a4 3618 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3619 struct regnode_charclass_class);
830247a4 3620 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3621 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3622 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3623 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3624 PerlIO_printf(Perl_debug_log,
a0288114 3625 "synthetic stclass \"%s\".\n",
3f7c398e 3626 SvPVX_const(sv));});
653099ff 3627 }
a0d0e21e
LW
3628 }
3629
a0d0e21e 3630 r->minlen = minlen;
b81d288d 3631 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3632 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3633 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3634 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3635 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3636 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3637 if (RExC_seen & REG_SEEN_CANY)
3638 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3639 Newxz(r->startp, RExC_npar, I32);
3640 Newxz(r->endp, RExC_npar, I32);
07be1b83 3641
f2278c82 3642 DEBUG_r( RX_DEBUG_on(r) );
3dab1dad
YO
3643 DEBUG_COMPILE_r({
3644 if (SvIV(re_debug_flags)> (RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE))
3645 PerlIO_printf(Perl_debug_log,"Final program:\n");
3646 regdump(r);
3647 });
a0d0e21e 3648 return(r);
a687059c
LW
3649}
3650
3dab1dad
YO
3651
3652#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3653 int rem=(int)(RExC_end - RExC_parse); \
3654 int cut; \
3655 int num; \
3656 int iscut=0; \
3657 if (rem>10) { \
3658 rem=10; \
3659 iscut=1; \