This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH] More regex optimisations and debug enhancements (including Andys stuff...
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad
YO
125#ifdef DEBUGGING
126 char *lastparse;
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
27da23d5
JH
198static const scan_data_t zero_scan_data =
199 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
eb160463 230#define UTF (RExC_utf8 != 0)
e2509266
JH
231#define LOC ((RExC_flags & PMf_LOCALE) != 0)
232#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
ccb2c380 259#define FAIL(msg) STMT_START { \
bfed75c6 260 const char *ellipses = ""; \
ccb2c380
MP
261 IV len = RExC_end - RExC_precomp; \
262 \
263 if (!SIZE_ONLY) \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
268 ellipses = "..."; \
269 } \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
272} STMT_END
8615cb43 273
b45f050a 274/*
b45f050a
JF
275 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
276 */
ccb2c380 277#define Simple_vFAIL(m) STMT_START { \
a28509cc 278 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
279 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
280 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
281} STMT_END
b45f050a
JF
282
283/*
284 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
285 */
ccb2c380
MP
286#define vFAIL(m) STMT_START { \
287 if (!SIZE_ONLY) \
288 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
289 Simple_vFAIL(m); \
290} STMT_END
b45f050a
JF
291
292/*
293 * Like Simple_vFAIL(), but accepts two arguments.
294 */
ccb2c380 295#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 296 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
297 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
298 (int)offset, RExC_precomp, RExC_precomp + offset); \
299} STMT_END
b45f050a
JF
300
301/*
302 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
303 */
ccb2c380
MP
304#define vFAIL2(m,a1) STMT_START { \
305 if (!SIZE_ONLY) \
306 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
307 Simple_vFAIL2(m, a1); \
308} STMT_END
b45f050a
JF
309
310
311/*
312 * Like Simple_vFAIL(), but accepts three arguments.
313 */
ccb2c380 314#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 315 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
316 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
317 (int)offset, RExC_precomp, RExC_precomp + offset); \
318} STMT_END
b45f050a
JF
319
320/*
321 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
322 */
ccb2c380
MP
323#define vFAIL3(m,a1,a2) STMT_START { \
324 if (!SIZE_ONLY) \
325 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
326 Simple_vFAIL3(m, a1, a2); \
327} STMT_END
b45f050a
JF
328
329/*
330 * Like Simple_vFAIL(), but accepts four arguments.
331 */
ccb2c380 332#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 333 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
334 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
335 (int)offset, RExC_precomp, RExC_precomp + offset); \
336} STMT_END
b45f050a 337
ccb2c380 338#define vWARN(loc,m) STMT_START { \
a28509cc 339 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
340 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
341 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
342} STMT_END
343
344#define vWARNdep(loc,m) STMT_START { \
a28509cc 345 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
346 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
347 "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351
352#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 353 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
354 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
355 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 359 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
360 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
361 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
362} STMT_END
363
364#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 365 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
366 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
367 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
369
370#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 371 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
373 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
374} STMT_END
9d1d55b5 375
8615cb43 376
cd439c50 377/* Allow for side effects in s */
ccb2c380
MP
378#define REGC(c,s) STMT_START { \
379 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
380} STMT_END
cd439c50 381
fac92740
MJD
382/* Macros for recording node offsets. 20001227 mjd@plover.com
383 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
384 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
385 * Element 0 holds the number n.
386 */
387
388#define MJD_OFFSET_DEBUG(x)
a3621e74 389/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
390
391
392#define Set_Node_Offset_To_R(node,byte) STMT_START { \
393 if (! SIZE_ONLY) { \
394 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
395 __LINE__, (node), (byte))); \
396 if((node) < 0) { \
551405c4 397 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
398 } else { \
399 RExC_offsets[2*(node)-1] = (byte); \
400 } \
401 } \
402} STMT_END
403
404#define Set_Node_Offset(node,byte) \
405 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
406#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
407
408#define Set_Node_Length_To_R(node,len) STMT_START { \
409 if (! SIZE_ONLY) { \
410 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 411 __LINE__, (int)(node), (int)(len))); \
ccb2c380 412 if((node) < 0) { \
551405c4 413 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
414 } else { \
415 RExC_offsets[2*(node)] = (len); \
416 } \
417 } \
418} STMT_END
419
420#define Set_Node_Length(node,len) \
421 Set_Node_Length_To_R((node)-RExC_emit_start, len)
422#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
423#define Set_Node_Cur_Length(node) \
424 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
425
426/* Get offsets and lengths */
427#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
428#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
429
acfe0abc 430static void clear_re(pTHX_ void *r);
4327152a 431
653099ff
GS
432/* Mark that we cannot extend a found fixed substring at this point.
433 Updata the longest found anchored substring and the longest found
434 floating substrings if needed. */
435
4327152a 436STATIC void
097eb12c 437S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 438{
e1ec3a88
AL
439 const STRLEN l = CHR_SVLEN(data->last_found);
440 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 441
c277df42 442 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 443 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
444 if (*data->longest == data->longest_fixed) {
445 data->offset_fixed = l ? data->last_start_min : data->pos_min;
446 if (data->flags & SF_BEFORE_EOL)
b81d288d 447 data->flags
c277df42
IZ
448 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
449 else
450 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
451 }
452 else {
c277df42 453 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
454 data->offset_float_max = (l
455 ? data->last_start_max
c277df42 456 : data->pos_min + data->pos_delta);
9051bda5
HS
457 if ((U32)data->offset_float_max > (U32)I32_MAX)
458 data->offset_float_max = I32_MAX;
c277df42 459 if (data->flags & SF_BEFORE_EOL)
b81d288d 460 data->flags
c277df42
IZ
461 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
462 else
463 data->flags &= ~SF_FL_BEFORE_EOL;
464 }
465 }
466 SvCUR_set(data->last_found, 0);
0eda9292 467 {
a28509cc 468 SV * const sv = data->last_found;
097eb12c
AL
469 if (SvUTF8(sv) && SvMAGICAL(sv)) {
470 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
471 if (mg)
472 mg->mg_len = 0;
473 }
0eda9292 474 }
c277df42
IZ
475 data->last_end = -1;
476 data->flags &= ~SF_BEFORE_EOL;
477}
478
653099ff
GS
479/* Can match anything (initialization) */
480STATIC void
097eb12c 481S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 482{
653099ff 483 ANYOF_CLASS_ZERO(cl);
f8bef550 484 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 485 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
486 if (LOC)
487 cl->flags |= ANYOF_LOCALE;
488}
489
490/* Can match anything (initialization) */
491STATIC int
5f66b61c 492S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
493{
494 int value;
495
aaa51d5e 496 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
497 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
498 return 1;
1aa99e6b
IH
499 if (!(cl->flags & ANYOF_UNICODE_ALL))
500 return 0;
f8bef550
NC
501 if (!ANYOF_BITMAP_TESTALLSET(cl))
502 return 0;
653099ff
GS
503 return 1;
504}
505
506/* Can match anything (initialization) */
507STATIC void
097eb12c 508S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 509{
8ecf7187 510 Zero(cl, 1, struct regnode_charclass_class);
653099ff 511 cl->type = ANYOF;
830247a4 512 cl_anything(pRExC_state, cl);
653099ff
GS
513}
514
515STATIC void
097eb12c 516S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 517{
8ecf7187 518 Zero(cl, 1, struct regnode_charclass_class);
653099ff 519 cl->type = ANYOF;
830247a4 520 cl_anything(pRExC_state, cl);
653099ff
GS
521 if (LOC)
522 cl->flags |= ANYOF_LOCALE;
523}
524
525/* 'And' a given class with another one. Can create false positives */
526/* We assume that cl is not inverted */
527STATIC void
5f66b61c 528S_cl_and(struct regnode_charclass_class *cl,
a28509cc 529 const struct regnode_charclass_class *and_with)
653099ff 530{
653099ff
GS
531 if (!(and_with->flags & ANYOF_CLASS)
532 && !(cl->flags & ANYOF_CLASS)
533 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
534 && !(and_with->flags & ANYOF_FOLD)
535 && !(cl->flags & ANYOF_FOLD)) {
536 int i;
537
538 if (and_with->flags & ANYOF_INVERT)
539 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
540 cl->bitmap[i] &= ~and_with->bitmap[i];
541 else
542 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
543 cl->bitmap[i] &= and_with->bitmap[i];
544 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
545 if (!(and_with->flags & ANYOF_EOS))
546 cl->flags &= ~ANYOF_EOS;
1aa99e6b 547
14ebb1a2
JH
548 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
549 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
550 cl->flags &= ~ANYOF_UNICODE_ALL;
551 cl->flags |= ANYOF_UNICODE;
552 ARG_SET(cl, ARG(and_with));
553 }
14ebb1a2
JH
554 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
555 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 556 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
557 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
558 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 559 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
560}
561
562/* 'OR' a given class with another one. Can create false positives */
563/* We assume that cl is not inverted */
564STATIC void
097eb12c 565S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 566{
653099ff
GS
567 if (or_with->flags & ANYOF_INVERT) {
568 /* We do not use
569 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
570 * <= (B1 | !B2) | (CL1 | !CL2)
571 * which is wasteful if CL2 is small, but we ignore CL2:
572 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
573 * XXXX Can we handle case-fold? Unclear:
574 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
575 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
576 */
577 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
578 && !(or_with->flags & ANYOF_FOLD)
579 && !(cl->flags & ANYOF_FOLD) ) {
580 int i;
581
582 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583 cl->bitmap[i] |= ~or_with->bitmap[i];
584 } /* XXXX: logic is complicated otherwise */
585 else {
830247a4 586 cl_anything(pRExC_state, cl);
653099ff
GS
587 }
588 } else {
589 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
590 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 591 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
592 || (cl->flags & ANYOF_FOLD)) ) {
593 int i;
594
595 /* OR char bitmap and class bitmap separately */
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= or_with->bitmap[i];
598 if (or_with->flags & ANYOF_CLASS) {
599 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
600 cl->classflags[i] |= or_with->classflags[i];
601 cl->flags |= ANYOF_CLASS;
602 }
603 }
604 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 605 cl_anything(pRExC_state, cl);
653099ff
GS
606 }
607 }
608 if (or_with->flags & ANYOF_EOS)
609 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
610
611 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
612 ARG(cl) != ARG(or_with)) {
613 cl->flags |= ANYOF_UNICODE_ALL;
614 cl->flags &= ~ANYOF_UNICODE;
615 }
616 if (or_with->flags & ANYOF_UNICODE_ALL) {
617 cl->flags |= ANYOF_UNICODE_ALL;
618 cl->flags &= ~ANYOF_UNICODE;
619 }
653099ff
GS
620}
621
5d1c421c 622/*
a3621e74 623
3dab1dad 624 make_trie(startbranch,first,last,tail,flags,depth)
a3621e74
YO
625 startbranch: the first branch in the whole branch sequence
626 first : start branch of sequence of branch-exact nodes.
627 May be the same as startbranch
628 last : Thing following the last branch.
629 May be the same as tail.
630 tail : item following the branch sequence
631 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
3dab1dad 632 depth : indent depth
a3621e74
YO
633
634Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
635
636A trie is an N'ary tree where the branches are determined by digital
637decomposition of the key. IE, at the root node you look up the 1st character and
638follow that branch repeat until you find the end of the branches. Nodes can be
639marked as "accepting" meaning they represent a complete word. Eg:
640
641 /he|she|his|hers/
642
643would convert into the following structure. Numbers represent states, letters
644following numbers represent valid transitions on the letter from that state, if
645the number is in square brackets it represents an accepting state, otherwise it
646will be in parenthesis.
647
648 +-h->+-e->[3]-+-r->(8)-+-s->[9]
649 | |
650 | (2)
651 | |
652 (1) +-i->(6)-+-s->[7]
653 |
654 +-s->(3)-+-h->(4)-+-e->[5]
655
656 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
657
658This shows that when matching against the string 'hers' we will begin at state 1
659read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
660then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
661is also accepting. Thus we know that we can match both 'he' and 'hers' with a
662single traverse. We store a mapping from accepting to state to which word was
663matched, and then when we have multiple possibilities we try to complete the
664rest of the regex in the order in which they occured in the alternation.
665
666The only prior NFA like behaviour that would be changed by the TRIE support is
667the silent ignoring of duplicate alternations which are of the form:
668
669 / (DUPE|DUPE) X? (?{ ... }) Y /x
670
671Thus EVAL blocks follwing a trie may be called a different number of times with
672and without the optimisation. With the optimisations dupes will be silently
673ignored. This inconsistant behaviour of EVAL type nodes is well established as
674the following demonstrates:
675
676 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
677
678which prints out 'word' three times, but
679
680 'words'=~/(word|word|word)(?{ print $1 })S/
681
682which doesnt print it out at all. This is due to other optimisations kicking in.
683
684Example of what happens on a structural level:
685
686The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
687
688 1: CURLYM[1] {1,32767}(18)
689 5: BRANCH(8)
690 6: EXACT <ac>(16)
691 8: BRANCH(11)
692 9: EXACT <ad>(16)
693 11: BRANCH(14)
694 12: EXACT <ab>(16)
695 16: SUCCEED(0)
696 17: NOTHING(18)
697 18: END(0)
698
699This would be optimizable with startbranch=5, first=5, last=16, tail=16
700and should turn into:
701
702 1: CURLYM[1] {1,32767}(18)
703 5: TRIE(16)
704 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
705 <ac>
706 <ad>
707 <ab>
708 16: SUCCEED(0)
709 17: NOTHING(18)
710 18: END(0)
711
712Cases where tail != last would be like /(?foo|bar)baz/:
713
714 1: BRANCH(4)
715 2: EXACT <foo>(8)
716 4: BRANCH(7)
717 5: EXACT <bar>(8)
718 7: TAIL(8)
719 8: EXACT <baz>(10)
720 10: END(0)
721
722which would be optimizable with startbranch=1, first=1, last=7, tail=8
723and would end up looking like:
724
725 1: TRIE(8)
726 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
727 <foo>
728 <bar>
729 7: TAIL(8)
730 8: EXACT <baz>(10)
731 10: END(0)
732
3dab1dad
YO
733 d = uvuni_to_utf8_flags(d, uv, 0);
734
735is the recommended Unicode-aware way of saying
736
737 *(d++) = uv;
a3621e74
YO
738*/
739
3dab1dad
YO
740#define TRIE_STORE_REVCHAR \
741 STMT_START { \
742 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
743 av_push( TRIE_REVCHARMAP(trie), tmp ); \
744 } STMT_END
a3621e74
YO
745
746#define TRIE_READ_CHAR STMT_START { \
747 if ( UTF ) { \
748 if ( folder ) { \
749 if ( foldlen > 0 ) { \
750 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
751 foldlen -= len; \
752 scan += len; \
753 len = 0; \
754 } else { \
e1ec3a88 755 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
756 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
757 foldlen -= UNISKIP( uvc ); \
758 scan = foldbuf + UNISKIP( uvc ); \
759 } \
760 } else { \
e1ec3a88 761 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
762 } \
763 } else { \
764 uvc = (U32)*uc; \
765 len = 1; \
766 } \
767} STMT_END
768
769
770#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
771#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
772#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
773#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
774
775#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
776 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
777 TRIE_LIST_LEN( state ) *= 2; \
778 Renew( trie->states[ state ].trans.list, \
779 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
780 } \
781 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
782 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
783 TRIE_LIST_CUR( state )++; \
784} STMT_END
785
786#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 787 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
788 4, reg_trie_trans_le ); \
789 TRIE_LIST_CUR( state ) = 1; \
790 TRIE_LIST_LEN( state ) = 4; \
791} STMT_END
792
3dab1dad
YO
793#define TRIE_HANDLE_WORD(state) STMT_START { \
794 if ( !trie->states[ state ].wordnum ) { \
795 /* we havent inserted this word into the structure yet. */\
796 trie->states[ state ].wordnum = ++curword; \
797 DEBUG_r({ \
798 /* store the word for dumping */ \
799 SV* tmp; \
800 if (OP(noper) != NOTHING ) \
801 tmp=newSVpvn( STRING( noper ), STR_LEN( noper ) );\
802 else \
803 tmp=newSVpvn( "", 0 ); \
804 if ( UTF ) SvUTF8_on( tmp ); \
805 av_push( trie->words, tmp ); \
806 }); \
807 } else { \
808 NOOP; /* It's a dupe. So ignore it. */ \
809 } \
810} STMT_END
811
812#ifdef DEBUGGING
813/*
814 dump_trie(trie)
815 dump_trie_interim_list(trie,next_alloc)
816 dump_trie_interim_table(trie,next_alloc)
817
818 These routines dump out a trie in a somewhat readable format.
819 The _interim_ variants are used for debugging the interim
820 tables that are used to generate the final compressed
821 representation which is what dump_trie expects.
822
823 Part of the reason for their existance is to provide a form
824 of documentation as to how the different representations function.
825
826*/
827
828/*
829 dump_trie(trie)
830 Dumps the final compressed table form of the trie to Perl_debug_log.
831 Used for debugging make_trie().
832*/
833
834STATIC void
835S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
836{
837 U32 state;
838 GET_RE_DEBUG_FLAGS_DECL;
839
840 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
841 (int)depth * 2 + 2,"",
842 "Match","Base","Ofs" );
843
844 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
845 SV **tmp = av_fetch( trie->revcharmap, state, 0);
846 if ( tmp ) {
847 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
848 }
849 }
850 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
851 (int)depth * 2 + 2,"");
852
853 for( state = 0 ; state < trie->uniquecharcount ; state++ )
854 PerlIO_printf( Perl_debug_log, "-----");
855 PerlIO_printf( Perl_debug_log, "\n");
856
857 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
858 const U32 base = trie->states[ state ].trans.base;
859
860 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
861
862 if ( trie->states[ state ].wordnum ) {
863 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
864 } else {
865 PerlIO_printf( Perl_debug_log, "%6s", "" );
866 }
867
868 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
869
870 if ( base ) {
871 U32 ofs = 0;
872
873 while( ( base + ofs < trie->uniquecharcount ) ||
874 ( base + ofs - trie->uniquecharcount < trie->lasttrans
875 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
876 ofs++;
877
878 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
879
880 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
881 if ( ( base + ofs >= trie->uniquecharcount ) &&
882 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
883 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
884 {
885 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
886 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
887 } else {
888 PerlIO_printf( Perl_debug_log, "%4s "," ." );
889 }
890 }
891
892 PerlIO_printf( Perl_debug_log, "]");
893
894 }
895 PerlIO_printf( Perl_debug_log, "\n" );
896 }
897}
898/*
899 dump_trie_interim_list(trie,next_alloc)
900 Dumps a fully constructed but uncompressed trie in list form.
901 List tries normally only are used for construction when the number of
902 possible chars (trie->uniquecharcount) is very high.
903 Used for debugging make_trie().
904*/
905STATIC void
906S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
907{
908 U32 state;
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
913 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
914
915 for( state=1 ; state < next_alloc ; state ++ ) {
916 U16 charid;
917
918 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
922 } else {
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
925 );
926 }
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
928 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
929 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
930 SvPV_nolen_const( *tmp ),
931 TRIE_LIST_ITEM(state,charid).forid,
932 (UV)TRIE_LIST_ITEM(state,charid).newstate
933 );
934 }
935
936 }
937}
938
939/*
940 dump_trie_interim_table(trie,next_alloc)
941 Dumps a fully constructed but uncompressed trie in table form.
942 This is the normal DFA style state transition table, with a few
943 twists to facilitate compression later.
944 Used for debugging make_trie().
945*/
946STATIC void
947S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
948{
949 U32 state;
950 U16 charid;
951 GET_RE_DEBUG_FLAGS_DECL;
952
953 /*
954 print out the table precompression so that we can do a visual check
955 that they are identical.
956 */
957
958 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
959
960 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
961 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
962 if ( tmp ) {
963 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
964 }
965 }
966
967 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
968
969 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
970 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
971 }
972
973 PerlIO_printf( Perl_debug_log, "\n" );
974
975 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
976
977 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
978 (int)depth * 2 + 2,"",
979 (UV)TRIE_NODENUM( state ) );
980
981 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
982 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
983 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
984 }
985 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
986 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
987 } else {
988 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
989 trie->states[ TRIE_NODENUM( state ) ].wordnum );
990 }
991 }
992}
993
994#endif
995
996
997
998
999
a3621e74 1000STATIC I32
3dab1dad 1001S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
a3621e74 1002{
27da23d5 1003 dVAR;
a3621e74
YO
1004 /* first pass, loop through and scan words */
1005 reg_trie_data *trie;
1006 regnode *cur;
9f7f3913 1007 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1008 STRLEN len = 0;
1009 UV uvc = 0;
1010 U16 curword = 0;
1011 U32 next_alloc = 0;
1012 /* we just use folder as a flag in utf8 */
e1ec3a88 1013 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1014 ? PL_fold
1015 : ( flags == EXACTFL
1016 ? PL_fold_locale
1017 : NULL
1018 )
1019 );
1020
e1ec3a88 1021 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1022 SV *re_trie_maxbuff;
3dab1dad
YO
1023#ifndef DEBUGGING
1024 /* these are only used during construction but are useful during
1025 debugging so we store them in the struct when debugging.
1026 Wordcount is actually superfluous in debugging as we have
1027 (AV*)trie->words to use for it, but thats not available when
1028 not debugging... We could make the macro use the AV during
1029 debugging tho...
1030 */
1031 U16 trie_wordcount=0;
1032 STRLEN trie_charcount=0;
1033 U32 trie_laststate=0;
1034 AV *trie_revcharmap;
1035#endif
a3621e74
YO
1036 GET_RE_DEBUG_FLAGS_DECL;
1037
a02a5408 1038 Newxz( trie, 1, reg_trie_data );
a3621e74 1039 trie->refcount = 1;
3dab1dad 1040 trie->startstate = 1;
a3621e74 1041 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1042 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1043 if (!(UTF && folder))
1044 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1045 DEBUG_r({
1046 trie->words = newAV();
a3621e74 1047 });
3dab1dad 1048 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1049
0111c4fd 1050 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1051 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1052 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1053 }
3dab1dad
YO
1054 DEBUG_OPTIMISE_r({
1055 PerlIO_printf( Perl_debug_log,
1056 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1057 (int)depth * 2 + 2, "",
1058 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1059 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1060 });
a3621e74
YO
1061 /* -- First loop and Setup --
1062
1063 We first traverse the branches and scan each word to determine if it
1064 contains widechars, and how many unique chars there are, this is
1065 important as we have to build a table with at least as many columns as we
1066 have unique chars.
1067
1068 We use an array of integers to represent the character codes 0..255
1069 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1070 native representation of the character value as the key and IV's for the
1071 coded index.
1072
1073 *TODO* If we keep track of how many times each character is used we can
1074 remap the columns so that the table compression later on is more
1075 efficient in terms of memory by ensuring most common value is in the
1076 middle and the least common are on the outside. IMO this would be better
1077 than a most to least common mapping as theres a decent chance the most
1078 common letter will share a node with the least common, meaning the node
1079 will not be compressable. With a middle is most common approach the worst
1080 case is when we have the least common nodes twice.
1081
1082 */
1083
a3621e74 1084 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1085 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1086 const U8 *uc = (U8*)STRING( noper );
a28509cc 1087 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1088 STRLEN foldlen = 0;
1089 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1090 const U8 *scan = (U8*)NULL;
3dab1dad 1091 STRLEN chars=0;
a3621e74 1092
3dab1dad
YO
1093 TRIE_WORDCOUNT(trie)++;
1094 if (OP(noper) == NOTHING) {
1095 trie->minlen= 0;
1096 continue;
1097 }
1098 if (trie->bitmap) {
1099 TRIE_BITMAP_SET(trie,*uc);
1100 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1101 }
a3621e74 1102 for ( ; uc < e ; uc += len ) {
3dab1dad 1103 TRIE_CHARCOUNT(trie)++;
a3621e74 1104 TRIE_READ_CHAR;
3dab1dad 1105 chars++;
a3621e74
YO
1106 if ( uvc < 256 ) {
1107 if ( !trie->charmap[ uvc ] ) {
1108 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1109 if ( folder )
1110 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1111 TRIE_STORE_REVCHAR;
a3621e74
YO
1112 }
1113 } else {
1114 SV** svpp;
1115 if ( !trie->widecharmap )
1116 trie->widecharmap = newHV();
1117
1118 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1119
1120 if ( !svpp )
e4584336 1121 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1122
1123 if ( !SvTRUE( *svpp ) ) {
1124 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1125 TRIE_STORE_REVCHAR;
a3621e74
YO
1126 }
1127 }
1128 }
3dab1dad
YO
1129 if( cur == first ) {
1130 trie->minlen=chars;
1131 trie->maxlen=chars;
1132 } else if (chars < trie->minlen) {
1133 trie->minlen=chars;
1134 } else if (chars > trie->maxlen) {
1135 trie->maxlen=chars;
1136 }
1137
a3621e74
YO
1138 } /* end first pass */
1139 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1140 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1141 (int)depth * 2 + 2,"",
1142 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1143 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, trie->minlen, trie->maxlen )
a3621e74
YO
1144 );
1145
1146
1147 /*
1148 We now know what we are dealing with in terms of unique chars and
1149 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1150 representation using a flat table will take. If it's over a reasonable
1151 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1152 conservative but potentially much slower representation using an array
1153 of lists.
1154
1155 At the end we convert both representations into the same compressed
1156 form that will be used in regexec.c for matching with. The latter
1157 is a form that cannot be used to construct with but has memory
1158 properties similar to the list form and access properties similar
1159 to the table form making it both suitable for fast searches and
1160 small enough that its feasable to store for the duration of a program.
1161
1162 See the comment in the code where the compressed table is produced
1163 inplace from the flat tabe representation for an explanation of how
1164 the compression works.
1165
1166 */
1167
1168
3dab1dad 1169 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1170 /*
1171 Second Pass -- Array Of Lists Representation
1172
1173 Each state will be represented by a list of charid:state records
1174 (reg_trie_trans_le) the first such element holds the CUR and LEN
1175 points of the allocated array. (See defines above).
1176
1177 We build the initial structure using the lists, and then convert
1178 it into the compressed table form which allows faster lookups
1179 (but cant be modified once converted).
a3621e74
YO
1180 */
1181
a3621e74
YO
1182 STRLEN transcount = 1;
1183
3dab1dad 1184 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1185 TRIE_LIST_NEW(1);
1186 next_alloc = 2;
1187
1188 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1189
c445ea15
AL
1190 regnode * const noper = NEXTOPER( cur );
1191 U8 *uc = (U8*)STRING( noper );
1192 const U8 * const e = uc + STR_LEN( noper );
1193 U32 state = 1; /* required init */
1194 U16 charid = 0; /* sanity init */
1195 U8 *scan = (U8*)NULL; /* sanity init */
1196 STRLEN foldlen = 0; /* required init */
1197 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1198
3dab1dad 1199 if (OP(noper) != NOTHING) {
c445ea15
AL
1200 for ( ; uc < e ; uc += len ) {
1201
1202 TRIE_READ_CHAR;
1203
1204 if ( uvc < 256 ) {
1205 charid = trie->charmap[ uvc ];
1206 } else {
1207 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1208 if ( !svpp ) {
1209 charid = 0;
1210 } else {
1211 charid=(U16)SvIV( *svpp );
1212 }
1213 }
1214 if ( charid ) {
a3621e74 1215
c445ea15
AL
1216 U16 check;
1217 U32 newstate = 0;
a3621e74 1218
c445ea15
AL
1219 charid--;
1220 if ( !trie->states[ state ].trans.list ) {
1221 TRIE_LIST_NEW( state );
1222 }
1223 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1224 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1225 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1226 break;
1227 }
1228 }
1229 if ( ! newstate ) {
1230 newstate = next_alloc++;
1231 TRIE_LIST_PUSH( state, charid, newstate );
1232 transcount++;
1233 }
1234 state = newstate;
1235 } else {
1236 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1237 }
c445ea15
AL
1238 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1239 }
c445ea15 1240 }
3dab1dad 1241 TRIE_HANDLE_WORD(state);
a3621e74
YO
1242
1243 } /* end second pass */
1244
3dab1dad 1245 TRIE_LASTSTATE(trie) = next_alloc;
a3621e74
YO
1246 Renew( trie->states, next_alloc, reg_trie_state );
1247
3dab1dad
YO
1248 /* and now dump it out before we compress it */
1249 DEBUG_TRIE_COMPILE_MORE_r(
1250 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1251 );
a3621e74 1252
a02a5408 1253 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1254 {
1255 U32 state;
a3621e74
YO
1256 U32 tp = 0;
1257 U32 zp = 0;
1258
1259
1260 for( state=1 ; state < next_alloc ; state ++ ) {
1261 U32 base=0;
1262
1263 /*
1264 DEBUG_TRIE_COMPILE_MORE_r(
1265 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1266 );
1267 */
1268
1269 if (trie->states[state].trans.list) {
1270 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1271 U16 maxid=minid;
a28509cc 1272 U16 idx;
a3621e74
YO
1273
1274 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1275 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1276 if ( forid < minid ) {
1277 minid=forid;
1278 } else if ( forid > maxid ) {
1279 maxid=forid;
1280 }
a3621e74
YO
1281 }
1282 if ( transcount < tp + maxid - minid + 1) {
1283 transcount *= 2;
1284 Renew( trie->trans, transcount, reg_trie_trans );
1285 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1286 }
1287 base = trie->uniquecharcount + tp - minid;
1288 if ( maxid == minid ) {
1289 U32 set = 0;
1290 for ( ; zp < tp ; zp++ ) {
1291 if ( ! trie->trans[ zp ].next ) {
1292 base = trie->uniquecharcount + zp - minid;
1293 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1294 trie->trans[ zp ].check = state;
1295 set = 1;
1296 break;
1297 }
1298 }
1299 if ( !set ) {
1300 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1301 trie->trans[ tp ].check = state;
1302 tp++;
1303 zp = tp;
1304 }
1305 } else {
1306 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1307 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1308 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1309 trie->trans[ tid ].check = state;
1310 }
1311 tp += ( maxid - minid + 1 );
1312 }
1313 Safefree(trie->states[ state ].trans.list);
1314 }
1315 /*
1316 DEBUG_TRIE_COMPILE_MORE_r(
1317 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1318 );
1319 */
1320 trie->states[ state ].trans.base=base;
1321 }
cc601c31 1322 trie->lasttrans = tp + 1;
a3621e74
YO
1323 }
1324 } else {
1325 /*
1326 Second Pass -- Flat Table Representation.
1327
1328 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1329 We know that we will need Charcount+1 trans at most to store the data
1330 (one row per char at worst case) So we preallocate both structures
1331 assuming worst case.
1332
1333 We then construct the trie using only the .next slots of the entry
1334 structs.
1335
1336 We use the .check field of the first entry of the node temporarily to
1337 make compression both faster and easier by keeping track of how many non
1338 zero fields are in the node.
1339
1340 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1341 transition.
1342
1343 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1344 number representing the first entry of the node, and state as a
1345 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1346 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1347 are 2 entrys per node. eg:
1348
1349 A B A B
1350 1. 2 4 1. 3 7
1351 2. 0 3 3. 0 5
1352 3. 0 0 5. 0 0
1353 4. 0 0 7. 0 0
1354
1355 The table is internally in the right hand, idx form. However as we also
1356 have to deal with the states array which is indexed by nodenum we have to
1357 use TRIE_NODENUM() to convert.
1358
1359 */
1360
3dab1dad
YO
1361
1362 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1363 reg_trie_trans );
3dab1dad 1364 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1365 next_alloc = trie->uniquecharcount + 1;
1366
3dab1dad 1367
a3621e74
YO
1368 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1369
c445ea15 1370 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1371 const U8 *uc = (U8*)STRING( noper );
1372 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1373
1374 U32 state = 1; /* required init */
1375
1376 U16 charid = 0; /* sanity init */
1377 U32 accept_state = 0; /* sanity init */
1378 U8 *scan = (U8*)NULL; /* sanity init */
1379
1380 STRLEN foldlen = 0; /* required init */
1381 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1382
3dab1dad 1383 if ( OP(noper) != NOTHING ) {
a3621e74
YO
1384 for ( ; uc < e ; uc += len ) {
1385
1386 TRIE_READ_CHAR;
1387
1388 if ( uvc < 256 ) {
1389 charid = trie->charmap[ uvc ];
1390 } else {
c445ea15
AL
1391 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1392 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1393 }
1394 if ( charid ) {
1395 charid--;
1396 if ( !trie->trans[ state + charid ].next ) {
1397 trie->trans[ state + charid ].next = next_alloc;
1398 trie->trans[ state ].check++;
1399 next_alloc += trie->uniquecharcount;
1400 }
1401 state = trie->trans[ state + charid ].next;
1402 } else {
e4584336 1403 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1404 }
1405 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1406 }
a3621e74 1407 }
3dab1dad
YO
1408 accept_state = TRIE_NODENUM( state );
1409 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1410
1411 } /* end second pass */
1412
3dab1dad
YO
1413 /* and now dump it out before we compress it */
1414 DEBUG_TRIE_COMPILE_MORE_r(
1415 dump_trie_interim_table(trie,next_alloc,depth+1)
1416 );
a3621e74 1417
a3621e74
YO
1418 {
1419 /*
1420 * Inplace compress the table.*
1421
1422 For sparse data sets the table constructed by the trie algorithm will
1423 be mostly 0/FAIL transitions or to put it another way mostly empty.
1424 (Note that leaf nodes will not contain any transitions.)
1425
1426 This algorithm compresses the tables by eliminating most such
1427 transitions, at the cost of a modest bit of extra work during lookup:
1428
1429 - Each states[] entry contains a .base field which indicates the
1430 index in the state[] array wheres its transition data is stored.
1431
1432 - If .base is 0 there are no valid transitions from that node.
1433
1434 - If .base is nonzero then charid is added to it to find an entry in
1435 the trans array.
1436
1437 -If trans[states[state].base+charid].check!=state then the
1438 transition is taken to be a 0/Fail transition. Thus if there are fail
1439 transitions at the front of the node then the .base offset will point
1440 somewhere inside the previous nodes data (or maybe even into a node
1441 even earlier), but the .check field determines if the transition is
1442 valid.
1443
1444 The following process inplace converts the table to the compressed
1445 table: We first do not compress the root node 1,and mark its all its
1446 .check pointers as 1 and set its .base pointer as 1 as well. This
1447 allows to do a DFA construction from the compressed table later, and
1448 ensures that any .base pointers we calculate later are greater than
1449 0.
1450
1451 - We set 'pos' to indicate the first entry of the second node.
1452
1453 - We then iterate over the columns of the node, finding the first and
1454 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1455 and set the .check pointers accordingly, and advance pos
1456 appropriately and repreat for the next node. Note that when we copy
1457 the next pointers we have to convert them from the original
1458 NODEIDX form to NODENUM form as the former is not valid post
1459 compression.
1460
1461 - If a node has no transitions used we mark its base as 0 and do not
1462 advance the pos pointer.
1463
1464 - If a node only has one transition we use a second pointer into the
1465 structure to fill in allocated fail transitions from other states.
1466 This pointer is independent of the main pointer and scans forward
1467 looking for null transitions that are allocated to a state. When it
1468 finds one it writes the single transition into the "hole". If the
1469 pointer doesnt find one the single transition is appeneded as normal.
1470
1471 - Once compressed we can Renew/realloc the structures to release the
1472 excess space.
1473
1474 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1475 specifically Fig 3.47 and the associated pseudocode.
1476
1477 demq
1478 */
a3b680e6 1479 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1480 U32 state, charid;
a3621e74 1481 U32 pos = 0, zp=0;
3dab1dad 1482 TRIE_LASTSTATE(trie) = laststate;
a3621e74
YO
1483
1484 for ( state = 1 ; state < laststate ; state++ ) {
1485 U8 flag = 0;
a28509cc
AL
1486 const U32 stateidx = TRIE_NODEIDX( state );
1487 const U32 o_used = trie->trans[ stateidx ].check;
1488 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1489 trie->trans[ stateidx ].check = 0;
1490
1491 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1492 if ( flag || trie->trans[ stateidx + charid ].next ) {
1493 if ( trie->trans[ stateidx + charid ].next ) {
1494 if (o_used == 1) {
1495 for ( ; zp < pos ; zp++ ) {
1496 if ( ! trie->trans[ zp ].next ) {
1497 break;
1498 }
1499 }
1500 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1501 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1502 trie->trans[ zp ].check = state;
1503 if ( ++zp > pos ) pos = zp;
1504 break;
1505 }
1506 used--;
1507 }
1508 if ( !flag ) {
1509 flag = 1;
1510 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1511 }
1512 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1513 trie->trans[ pos ].check = state;
1514 pos++;
1515 }
1516 }
1517 }
cc601c31 1518 trie->lasttrans = pos + 1;
a3621e74
YO
1519 Renew( trie->states, laststate + 1, reg_trie_state);
1520 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1521 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1522 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1523 (int)depth * 2 + 2,"",
1524 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1525 (IV)next_alloc,
1526 (IV)pos,
a3621e74
YO
1527 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1528 );
1529
1530 } /* end table compress */
1531 }
cc601c31
YO
1532 /* resize the trans array to remove unused space */
1533 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1534
3dab1dad
YO
1535 /* and now dump out the compressed format */
1536 DEBUG_TRIE_COMPILE_r(
1537 dump_trie(trie,depth+1)
1538 );
1539
1540 { /* Modify the program and insert the new TRIE node*/
1541 regnode *convert;
1542 U8 nodetype =(U8)(flags & 0xFF);
1543 char *str=NULL;
a3621e74 1544 /*
3dab1dad
YO
1545 This means we convert either the first branch or the first Exact,
1546 depending on whether the thing following (in 'last') is a branch
1547 or not and whther first is the startbranch (ie is it a sub part of
1548 the alternation or is it the whole thing.)
1549 Assuming its a sub part we conver the EXACT otherwise we convert
1550 the whole branch sequence, including the first.
a3621e74 1551 */
3dab1dad
YO
1552 /* Find the node we are going to overwrite */
1553 if ( first == startbranch && OP( last ) != BRANCH ) {
1554 convert = first;
a3621e74 1555 } else {
3dab1dad
YO
1556 convert = NEXTOPER( first );
1557 NEXT_OFF( first ) = (U16)(last - first);
a3621e74
YO
1558 }
1559
3dab1dad
YO
1560 /* But first we check to see if there is a common prefix we can
1561 split out as an EXACT and put in front of the TRIE node. */
1562 trie->startstate= 1;
1563 if ( trie->bitmap && !trie->widecharmap ) {
1564 U32 state;
1565 DEBUG_OPTIMISE_r(
1566 PerlIO_printf( Perl_debug_log,"%*sLaststate:%d\n",
1567 (int)depth * 2 + 2,"",
1568 TRIE_LASTSTATE(trie)));
1569 for( state= 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
a3621e74 1570 U32 ofs = 0;
3dab1dad
YO
1571 I32 idx= -1;
1572 U32 count= 0;
1573 const U32 base= trie->states[ state ].trans.base;
a3621e74 1574
3dab1dad
YO
1575 if ( trie->states[state].wordnum )
1576 count =1;
a3621e74 1577
3dab1dad
YO
1578 for ( ofs= 0 ; ofs < trie->uniquecharcount ; ofs++ )
1579 {
a3621e74 1580
cc601c31
YO
1581 if ( ( base + ofs >= trie->uniquecharcount ) &&
1582 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1583 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1584 {
3dab1dad
YO
1585 if ( ++count > 1 ) {
1586 SV **tmp= av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1587 const char *ch= SvPV_nolen_const( *tmp );
1588 if (state==1) break;
1589 if ( count == 2 ) {
1590 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1591 DEBUG_OPTIMISE_r(
1592 PerlIO_printf( Perl_debug_log,"%*sNew Start State=%d Class: [",
1593 (int)depth * 2 + 2,"",
1594 state));
1595 if (idx>-1) {
1596 SV **tmp= av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1597 const char *ch= SvPV_nolen_const( *tmp );
1598
1599 TRIE_BITMAP_SET(trie,*ch);
1600 if ( folder )
1601 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1602 DEBUG_OPTIMISE_r(
1603 PerlIO_printf( Perl_debug_log,"%s", ch)
1604 );
a3621e74
YO
1605 }
1606 }
3dab1dad
YO
1607 TRIE_BITMAP_SET(trie,*ch);
1608 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *ch ]);
1609 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
a3621e74 1610 }
3dab1dad 1611 idx= ofs;
a3621e74 1612 }
3dab1dad
YO
1613 }
1614 if ( count == 1 ) {
1615 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1616 const char *ch= SvPV_nolen_const( *tmp );
1617 DEBUG_OPTIMISE_r(
1618 PerlIO_printf( Perl_debug_log,"%*sPrefix State: %d Idx:%d Char='%s'\n",
1619 (int)depth * 2 + 2,"",
1620 state, idx, ch)
1621 );
1622 if ( state==1 ) {
1623 OP( convert ) = nodetype;
1624 str=STRING(convert);
1625 STR_LEN(convert)=0;
1626 }
1627 *str++=*ch;
1628 STR_LEN(convert)++;
a3621e74 1629
a3621e74 1630 } else {
3dab1dad
YO
1631 if (state>1)
1632 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1633 break;
a3621e74 1634 }
3dab1dad
YO
1635 }
1636 if (str) {
1637 regnode *n= convert+NODE_SZ_STR(convert);
1638 NEXT_OFF(convert)= NODE_SZ_STR(convert);
1639 trie->startstate= state;
1640 trie->minlen-= (state-1);
1641 trie->maxlen-= (state-1);
1642 if (trie->maxlen)
1643 convert= n;
1644 else {
1645 NEXT_OFF(convert) = (U16)(tail - convert);
1646 }
1647 }
1648 }
1649 if ( trie->maxlen ) {
1650 OP( convert ) = TRIE;
a3621e74
YO
1651 NEXT_OFF( convert ) = (U16)(tail - convert);
1652 ARG_SET( convert, data_slot );
1653
3dab1dad
YO
1654 /* store the type in the flags */
1655 convert->flags = nodetype;
1656 /* XXX We really should free up the resource in trie now, as we wont use them */
1657 }
a3621e74
YO
1658 /* needed for dumping*/
1659 DEBUG_r({
1660 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1661 /* We now need to mark all of the space originally used by the
1662 branches as optimized away. This keeps the dumpuntil from
1663 throwing a wobbly as it doesnt use regnext() to traverse the
1664 opcodes.
1665 */
1666 while( optimize < last ) {
1667 OP( optimize ) = OPTIMIZED;
1668 optimize++;
1669 }
1670 });
1671 } /* end node insert */
1672 return 1;
1673}
1674
a3621e74 1675/*
5d1c421c
JH
1676 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1677 * These need to be revisited when a newer toolchain becomes available.
1678 */
1679#if defined(__sparc64__) && defined(__GNUC__)
1680# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1681# undef SPARC64_GCC_WORKAROUND
1682# define SPARC64_GCC_WORKAROUND 1
1683# endif
1684#endif
1685
653099ff
GS
1686/* REx optimizer. Converts nodes into quickier variants "in place".
1687 Finds fixed substrings. */
1688
a0288114 1689/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1690 to the position after last scanned or to NULL. */
1691
76e3520e 1692STATIC I32
9a957fbc
AL
1693S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1694 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1695 /* scanp: Start here (read-write). */
1696 /* deltap: Write maxlen-minlen here. */
1697 /* last: Stop before this one. */
1698{
97aff369 1699 dVAR;
c277df42
IZ
1700 I32 min = 0, pars = 0, code;
1701 regnode *scan = *scanp, *next;
1702 I32 delta = 0;
1703 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1704 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1705 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1706 scan_data_t data_fake;
653099ff 1707 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1708 SV *re_trie_maxbuff = NULL;
1709
1710 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1711
3dab1dad 1712PEEP:
c277df42 1713 while (scan && OP(scan) != END && scan < last) {
3dab1dad
YO
1714 #ifdef DEBUGGING
1715 int merged=0;
1716 #endif
c277df42 1717 /* Peephole optimizer: */
a3621e74 1718 DEBUG_OPTIMISE_r({
c445ea15 1719 SV * const mysv=sv_newmortal();
32fc9b6a 1720 regprop(RExC_rx, mysv, scan);
3dab1dad
YO
1721 PerlIO_printf(Perl_debug_log, "%*s%4s~ %s (%d)\n",
1722 (int)depth*2, "",
1723 scan==*scanp ? "Peep" : "",
1724 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
a3621e74 1725 });
3dab1dad 1726 if (PL_regkind[OP(scan)] == EXACT) {
653099ff 1727 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1728 regnode *n = regnext(scan);
1729 U32 stringok = 1;
1730#ifdef DEBUGGING
1731 regnode *stop = scan;
b81d288d 1732#endif
cd439c50 1733 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1734 /* Skip NOTHING, merge EXACT*. */
1735 while (n &&
3dab1dad 1736 ( PL_regkind[OP(n)] == NOTHING ||
c277df42
IZ
1737 (stringok && (OP(n) == OP(scan))))
1738 && NEXT_OFF(n)
1739 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1740 if (OP(n) == TAIL || n > next)
1741 stringok = 0;
3dab1dad
YO
1742 if (PL_regkind[OP(n)] == NOTHING) {
1743 DEBUG_OPTIMISE_r({
1744 SV * const mysv=sv_newmortal();
1745 regprop(RExC_rx, mysv, n);
1746 PerlIO_printf(Perl_debug_log, "%*sskip: %s (%d)\n",
1747 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
1748 });
c277df42
IZ
1749 NEXT_OFF(scan) += NEXT_OFF(n);
1750 next = n + NODE_STEP_REGNODE;
1751#ifdef DEBUGGING
1752 if (stringok)
1753 stop = n;
b81d288d 1754#endif
c277df42 1755 n = regnext(n);
a0ed51b3 1756 }
f49d4d0f 1757 else if (stringok) {
a3b680e6 1758 const int oldl = STR_LEN(scan);
c445ea15 1759 regnode * const nnext = regnext(n);
3dab1dad
YO
1760 DEBUG_OPTIMISE_r({
1761 SV * const mysv=sv_newmortal();
1762 regprop(RExC_rx, mysv, n);
1763 PerlIO_printf(Perl_debug_log, "%*s mrg: %s (%d)\n",
1764 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
1765 merged++;
1766 });
b81d288d 1767 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1768 break;
1769 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1770 STR_LEN(scan) += STR_LEN(n);
1771 next = n + NODE_SZ_STR(n);
c277df42 1772 /* Now we can overwrite *n : */
f49d4d0f 1773 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1774#ifdef DEBUGGING
f49d4d0f 1775 stop = next - 1;
b81d288d 1776#endif
c277df42
IZ
1777 n = nnext;
1778 }
1779 }
61a36c01 1780
a3621e74 1781 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1782/*
1783 Two problematic code points in Unicode casefolding of EXACT nodes:
1784
1785 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1786 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1787
1788 which casefold to
1789
1790 Unicode UTF-8
1791
1792 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1793 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1794
1795 This means that in case-insensitive matching (or "loose matching",
1796 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1797 length of the above casefolded versions) can match a target string
1798 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1799 This would rather mess up the minimum length computation.
1800
1801 What we'll do is to look for the tail four bytes, and then peek
1802 at the preceding two bytes to see whether we need to decrease
1803 the minimum length by four (six minus two).
1804
1805 Thanks to the design of UTF-8, there cannot be false matches:
1806 A sequence of valid UTF-8 bytes cannot be a subsequence of
1807 another valid sequence of UTF-8 bytes.
1808
1809*/
c445ea15
AL
1810 char * const s0 = STRING(scan), *s, *t;
1811 char * const s1 = s0 + STR_LEN(scan) - 1;
1812 char * const s2 = s1 - 4;
d4c19fe8 1813 const char t0[] = "\xcc\x88\xcc\x81";
a28509cc 1814 const char * const t1 = t0 + 3;
2af232bd 1815
61a36c01
JH
1816 for (s = s0 + 2;
1817 s < s2 && (t = ninstr(s, s1, t0, t1));
1818 s = t + 4) {
1819 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1820 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1821 min -= 4;
1822 }
1823 }
1824
c277df42
IZ
1825#ifdef DEBUGGING
1826 /* Allow dumping */
cd439c50 1827 n = scan + NODE_SZ_STR(scan);
c277df42 1828 while (n <= stop) {
3dab1dad 1829 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1830 OP(n) = OPTIMIZED;
1831 NEXT_OFF(n) = 0;
1832 }
1833 n++;
1834 }
653099ff 1835#endif
c277df42 1836 }
a3621e74
YO
1837
1838
1839
653099ff
GS
1840 /* Follow the next-chain of the current node and optimize
1841 away all the NOTHINGs from it. */
c277df42 1842 if (OP(scan) != CURLYX) {
a3b680e6 1843 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1844 ? I32_MAX
1845 /* I32 may be smaller than U16 on CRAYs! */
1846 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1847 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1848 int noff;
1849 regnode *n = scan;
b81d288d 1850
c277df42
IZ
1851 /* Skip NOTHING and LONGJMP. */
1852 while ((n = regnext(n))
3dab1dad 1853 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1854 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1855 && off + noff < max)
1856 off += noff;
1857 if (reg_off_by_arg[OP(scan)])
1858 ARG(scan) = off;
b81d288d 1859 else
c277df42
IZ
1860 NEXT_OFF(scan) = off;
1861 }
a3621e74 1862
3dab1dad
YO
1863 DEBUG_OPTIMISE_r({if (merged){
1864 SV * const mysv=sv_newmortal();
1865 regprop(RExC_rx, mysv, scan);
1866 PerlIO_printf(Perl_debug_log, "%*s res: %s (%d)\n",
1867 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
1868 }});
1869
653099ff
GS
1870 /* The principal pseudo-switch. Cannot be a switch, since we
1871 look into several different things. */
b81d288d 1872 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1873 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1874 next = regnext(scan);
1875 code = OP(scan);
a3621e74 1876 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1877
1878 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1879 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1880 struct regnode_charclass_class accum;
d4c19fe8 1881 regnode * const startbranch=scan;
c277df42 1882
653099ff 1883 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1884 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1885 if (flags & SCF_DO_STCLASS)
830247a4 1886 cl_init_zero(pRExC_state, &accum);
a3621e74 1887
c277df42 1888 while (OP(scan) == code) {
830247a4 1889 I32 deltanext, minnext, f = 0, fake;
653099ff 1890 struct regnode_charclass_class this_class;
c277df42
IZ
1891
1892 num++;
1893 data_fake.flags = 0;
b81d288d 1894 if (data) {
2c2d71f5 1895 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1896 data_fake.last_closep = data->last_closep;
1897 }
1898 else
1899 data_fake.last_closep = &fake;
c277df42
IZ
1900 next = regnext(scan);
1901 scan = NEXTOPER(scan);
1902 if (code != BRANCH)
1903 scan = NEXTOPER(scan);
653099ff 1904 if (flags & SCF_DO_STCLASS) {
830247a4 1905 cl_init(pRExC_state, &this_class);
653099ff
GS
1906 data_fake.start_class = &this_class;
1907 f = SCF_DO_STCLASS_AND;
b81d288d 1908 }
e1901655
IZ
1909 if (flags & SCF_WHILEM_VISITED_POS)
1910 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1911
653099ff 1912 /* we suppose the run is continuous, last=next...*/
830247a4 1913 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1914 next, &data_fake, f,depth+1);
b81d288d 1915 if (min1 > minnext)
c277df42
IZ
1916 min1 = minnext;
1917 if (max1 < minnext + deltanext)
1918 max1 = minnext + deltanext;
1919 if (deltanext == I32_MAX)
aca2d497 1920 is_inf = is_inf_internal = 1;
c277df42
IZ
1921 scan = next;
1922 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1923 pars++;
3dab1dad
YO
1924 if (data) {
1925 if (data_fake.flags & SF_HAS_EVAL)
c277df42 1926 data->flags |= SF_HAS_EVAL;
2c2d71f5 1927 data->whilem_c = data_fake.whilem_c;
3dab1dad 1928 }
653099ff 1929 if (flags & SCF_DO_STCLASS)
830247a4 1930 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1931 if (code == SUSPEND)
c277df42
IZ
1932 break;
1933 }
1934 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1935 min1 = 0;
1936 if (flags & SCF_DO_SUBSTR) {
1937 data->pos_min += min1;
1938 data->pos_delta += max1 - min1;
1939 if (max1 != min1 || is_inf)
1940 data->longest = &(data->longest_float);
1941 }
1942 min += min1;
1943 delta += max1 - min1;
653099ff 1944 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1945 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1946 if (min1) {
1947 cl_and(data->start_class, &and_with);
1948 flags &= ~SCF_DO_STCLASS;
1949 }
1950 }
1951 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1952 if (min1) {
1953 cl_and(data->start_class, &accum);
653099ff 1954 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1955 }
1956 else {
b81d288d 1957 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1958 * data->start_class */
1959 StructCopy(data->start_class, &and_with,
1960 struct regnode_charclass_class);
1961 flags &= ~SCF_DO_STCLASS_AND;
1962 StructCopy(&accum, data->start_class,
1963 struct regnode_charclass_class);
1964 flags |= SCF_DO_STCLASS_OR;
1965 data->start_class->flags |= ANYOF_EOS;
1966 }
653099ff 1967 }
a3621e74
YO
1968
1969 /* demq.
1970
1971 Assuming this was/is a branch we are dealing with: 'scan' now
1972 points at the item that follows the branch sequence, whatever
1973 it is. We now start at the beginning of the sequence and look
1974 for subsequences of
1975
1976 BRANCH->EXACT=>X
1977 BRANCH->EXACT=>X
1978
1979 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1980
1981 If we can find such a subseqence we need to turn the first
1982 element into a trie and then add the subsequent branch exact
1983 strings to the trie.
1984
1985 We have two cases
1986
1987 1. patterns where the whole set of branch can be converted to a trie,
1988
1989 2. patterns where only a subset of the alternations can be
1990 converted to a trie.
1991
1992 In case 1 we can replace the whole set with a single regop
1993 for the trie. In case 2 we need to keep the start and end
1994 branchs so
1995
1996 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1997 becomes BRANCH TRIE; BRANCH X;
1998
1999 Hypthetically when we know the regex isnt anchored we can
2000 turn a case 1 into a DFA and let it rip... Every time it finds a match
2001 it would just call its tail, no WHILEM/CURLY needed.
2002
2003 */
0111c4fd 2004 if (DO_TRIE) {
3dab1dad 2005 int made=0;
0111c4fd
RGS
2006 if (!re_trie_maxbuff) {
2007 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2008 if (!SvIOK(re_trie_maxbuff))
2009 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2010 }
a3621e74
YO
2011 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2012 regnode *cur;
2013 regnode *first = (regnode *)NULL;
2014 regnode *last = (regnode *)NULL;
2015 regnode *tail = scan;
2016 U8 optype = 0;
2017 U32 count=0;
2018
2019#ifdef DEBUGGING
c445ea15 2020 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2021#endif
2022 /* var tail is used because there may be a TAIL
2023 regop in the way. Ie, the exacts will point to the
2024 thing following the TAIL, but the last branch will
2025 point at the TAIL. So we advance tail. If we
2026 have nested (?:) we may have to move through several
2027 tails.
2028 */
2029
2030 while ( OP( tail ) == TAIL ) {
2031 /* this is the TAIL generated by (?:) */
2032 tail = regnext( tail );
2033 }
2034
3dab1dad 2035
a3621e74 2036 DEBUG_OPTIMISE_r({
32fc9b6a 2037 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2038 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2039 (int)depth * 2 + 2, "",
2040 "Looking for TRIE'able sequences. Tail node is: ",
2041 SvPV_nolen_const( mysv )
a3621e74
YO
2042 );
2043 });
3dab1dad 2044
a3621e74
YO
2045 /*
2046
2047 step through the branches, cur represents each
2048 branch, noper is the first thing to be matched
2049 as part of that branch and noper_next is the
2050 regnext() of that node. if noper is an EXACT
2051 and noper_next is the same as scan (our current
2052 position in the regex) then the EXACT branch is
2053 a possible optimization target. Once we have
2054 two or more consequetive such branches we can
2055 create a trie of the EXACT's contents and stich
2056 it in place. If the sequence represents all of
2057 the branches we eliminate the whole thing and
2058 replace it with a single TRIE. If it is a
2059 subsequence then we need to stitch it in. This
2060 means the first branch has to remain, and needs
2061 to be repointed at the item on the branch chain
2062 following the last branch optimized. This could
2063 be either a BRANCH, in which case the
2064 subsequence is internal, or it could be the
2065 item following the branch sequence in which
2066 case the subsequence is at the end.
2067
2068 */
2069
2070 /* dont use tail as the end marker for this traverse */
2071 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2072 regnode * const noper = NEXTOPER( cur );
2073 regnode * const noper_next = regnext( noper );
a3621e74 2074
a3621e74 2075 DEBUG_OPTIMISE_r({
32fc9b6a 2076 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2077 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2078 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2079
32fc9b6a 2080 regprop(RExC_rx, mysv, noper);
a3621e74 2081 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2082 SvPV_nolen_const(mysv));
a3621e74
YO
2083
2084 if ( noper_next ) {
32fc9b6a 2085 regprop(RExC_rx, mysv, noper_next );
a3621e74 2086 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2087 SvPV_nolen_const(mysv));
a3621e74 2088 }
3dab1dad
YO
2089 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2090 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2091 });
3dab1dad
YO
2092 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2093 : PL_regkind[ OP( noper ) ] == EXACT )
2094 || OP(noper) == NOTHING )
a3621e74
YO
2095 && noper_next == tail && count<U16_MAX)
2096 {
2097 count++;
3dab1dad
YO
2098 if ( !first || optype == NOTHING ) {
2099 if (!first) first = cur;
a3621e74
YO
2100 optype = OP( noper );
2101 } else {
a3621e74 2102 last = cur;
a3621e74
YO
2103 }
2104 } else {
2105 if ( last ) {
3dab1dad 2106 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
a3621e74 2107 }
3dab1dad 2108 if ( PL_regkind[ OP( noper ) ] == EXACT
a3621e74
YO
2109 && noper_next == tail )
2110 {
2111 count = 1;
2112 first = cur;
2113 optype = OP( noper );
2114 } else {
2115 count = 0;
2116 first = NULL;
2117 optype = 0;
2118 }
2119 last = NULL;
2120 }
2121 }
2122 DEBUG_OPTIMISE_r({
32fc9b6a 2123 regprop(RExC_rx, mysv, cur);
a3621e74 2124 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2125 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2126 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2127
2128 });
2129 if ( last ) {
3dab1dad
YO
2130 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2131#ifdef TRIE_STUDY_OPT
2132 if ( OP(first)!=TRIE && startbranch == first ) {
2133
a3621e74 2134 }
3dab1dad 2135#endif
a3621e74
YO
2136 }
2137 }
3dab1dad
YO
2138
2139 } /* do trie */
a0ed51b3 2140 }
a3621e74 2141 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2142 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2143 } else /* single branch is optimized. */
c277df42
IZ
2144 scan = NEXTOPER(scan);
2145 continue;
a0ed51b3
LW
2146 }
2147 else if (OP(scan) == EXACT) {
cd439c50 2148 I32 l = STR_LEN(scan);
c445ea15 2149 UV uc;
a0ed51b3 2150 if (UTF) {
a3b680e6 2151 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2152 l = utf8_length(s, s + l);
9041c2e3 2153 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2154 } else {
2155 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2156 }
2157 min += l;
c277df42 2158 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2159 /* The code below prefers earlier match for fixed
2160 offset, later match for variable offset. */
2161 if (data->last_end == -1) { /* Update the start info. */
2162 data->last_start_min = data->pos_min;
2163 data->last_start_max = is_inf
b81d288d 2164 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2165 }
cd439c50 2166 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2167 if (UTF)
2168 SvUTF8_on(data->last_found);
0eda9292 2169 {
9a957fbc 2170 SV * const sv = data->last_found;
a28509cc 2171 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2172 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2173 if (mg && mg->mg_len >= 0)
5e43f467
JH
2174 mg->mg_len += utf8_length((U8*)STRING(scan),
2175 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2176 }
c277df42
IZ
2177 data->last_end = data->pos_min + l;
2178 data->pos_min += l; /* As in the first entry. */
2179 data->flags &= ~SF_BEFORE_EOL;
2180 }
653099ff
GS
2181 if (flags & SCF_DO_STCLASS_AND) {
2182 /* Check whether it is compatible with what we know already! */
2183 int compat = 1;
2184
1aa99e6b 2185 if (uc >= 0x100 ||
516a5887 2186 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2187 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2188 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2189 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2190 )
653099ff
GS
2191 compat = 0;
2192 ANYOF_CLASS_ZERO(data->start_class);
2193 ANYOF_BITMAP_ZERO(data->start_class);
2194 if (compat)
1aa99e6b 2195 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2196 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2197 if (uc < 0x100)
2198 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2199 }
2200 else if (flags & SCF_DO_STCLASS_OR) {
2201 /* false positive possible if the class is case-folded */
1aa99e6b 2202 if (uc < 0x100)
9b877dbb
IH
2203 ANYOF_BITMAP_SET(data->start_class, uc);
2204 else
2205 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2206 data->start_class->flags &= ~ANYOF_EOS;
2207 cl_and(data->start_class, &and_with);
2208 }
2209 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2210 }
3dab1dad 2211 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2212 I32 l = STR_LEN(scan);
1aa99e6b 2213 UV uc = *((U8*)STRING(scan));
653099ff
GS
2214
2215 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2216 if (flags & SCF_DO_SUBSTR) {
2217 assert(data);
830247a4 2218 scan_commit(pRExC_state, data);
ecaa9b9c 2219 }
a0ed51b3 2220 if (UTF) {
6136c704 2221 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2222 l = utf8_length(s, s + l);
9041c2e3 2223 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2224 }
2225 min += l;
ecaa9b9c 2226 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2227 data->pos_min += l;
653099ff
GS
2228 if (flags & SCF_DO_STCLASS_AND) {
2229 /* Check whether it is compatible with what we know already! */
2230 int compat = 1;
2231
1aa99e6b 2232 if (uc >= 0x100 ||
516a5887 2233 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2234 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2235 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2236 compat = 0;
2237 ANYOF_CLASS_ZERO(data->start_class);
2238 ANYOF_BITMAP_ZERO(data->start_class);
2239 if (compat) {
1aa99e6b 2240 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2241 data->start_class->flags &= ~ANYOF_EOS;
2242 data->start_class->flags |= ANYOF_FOLD;
2243 if (OP(scan) == EXACTFL)
2244 data->start_class->flags |= ANYOF_LOCALE;
2245 }
2246 }
2247 else if (flags & SCF_DO_STCLASS_OR) {
2248 if (data->start_class->flags & ANYOF_FOLD) {
2249 /* false positive possible if the class is case-folded.
2250 Assume that the locale settings are the same... */
1aa99e6b
IH
2251 if (uc < 0x100)
2252 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2253 data->start_class->flags &= ~ANYOF_EOS;
2254 }
2255 cl_and(data->start_class, &and_with);
2256 }
2257 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2258 }
3dab1dad
YO
2259#ifdef TRIE_STUDY_OPT
2260 else if (OP(scan) == TRIE) {
2261 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2262 min += trie->minlen;
2263 flags &= ~SCF_DO_STCLASS; /* xxx */
2264 if (flags & SCF_DO_SUBSTR) {
2265 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2266 data->pos_min += trie->minlen;
2267 data->pos_delta+= (trie->maxlen-trie->minlen);
2268 }
2269 }
2270#endif
bfed75c6 2271 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2272 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2273 I32 f = flags, pos_before = 0;
d4c19fe8 2274 regnode * const oscan = scan;
653099ff
GS
2275 struct regnode_charclass_class this_class;
2276 struct regnode_charclass_class *oclass = NULL;
727f22e3 2277 I32 next_is_eval = 0;
653099ff 2278
3dab1dad 2279 switch (PL_regkind[OP(scan)]) {
653099ff 2280 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2281 scan = NEXTOPER(scan);
2282 goto finish;
2283 case PLUS:
653099ff 2284 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2285 next = NEXTOPER(scan);
653099ff 2286 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2287 mincount = 1;
2288 maxcount = REG_INFTY;
c277df42
IZ
2289 next = regnext(scan);
2290 scan = NEXTOPER(scan);
2291 goto do_curly;
2292 }
2293 }
2294 if (flags & SCF_DO_SUBSTR)
2295 data->pos_min++;
2296 min++;
2297 /* Fall through. */
2298 case STAR:
653099ff
GS
2299 if (flags & SCF_DO_STCLASS) {
2300 mincount = 0;
b81d288d 2301 maxcount = REG_INFTY;
653099ff
GS
2302 next = regnext(scan);
2303 scan = NEXTOPER(scan);
2304 goto do_curly;
2305 }
b81d288d 2306 is_inf = is_inf_internal = 1;
c277df42
IZ
2307 scan = regnext(scan);
2308 if (flags & SCF_DO_SUBSTR) {
830247a4 2309 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2310 data->longest = &(data->longest_float);
2311 }
2312 goto optimize_curly_tail;
2313 case CURLY:
b81d288d 2314 mincount = ARG1(scan);
c277df42
IZ
2315 maxcount = ARG2(scan);
2316 next = regnext(scan);
cb434fcc
IZ
2317 if (OP(scan) == CURLYX) {
2318 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2319 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2320 }
c277df42 2321 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2322 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2323 do_curly:
2324 if (flags & SCF_DO_SUBSTR) {
830247a4 2325 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2326 pos_before = data->pos_min;
2327 }
2328 if (data) {
2329 fl = data->flags;
2330 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2331 if (is_inf)
2332 data->flags |= SF_IS_INF;
2333 }
653099ff 2334 if (flags & SCF_DO_STCLASS) {
830247a4 2335 cl_init(pRExC_state, &this_class);
653099ff
GS
2336 oclass = data->start_class;
2337 data->start_class = &this_class;
2338 f |= SCF_DO_STCLASS_AND;
2339 f &= ~SCF_DO_STCLASS_OR;
2340 }
e1901655
IZ
2341 /* These are the cases when once a subexpression
2342 fails at a particular position, it cannot succeed
2343 even after backtracking at the enclosing scope.
b81d288d 2344
e1901655
IZ
2345 XXXX what if minimal match and we are at the
2346 initial run of {n,m}? */
2347 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2348 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2349
c277df42 2350 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2351 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2352 (mincount == 0
2353 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2354
2355 if (flags & SCF_DO_STCLASS)
2356 data->start_class = oclass;
2357 if (mincount == 0 || minnext == 0) {
2358 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2359 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2360 }
2361 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2362 /* Switch to OR mode: cache the old value of
653099ff
GS
2363 * data->start_class */
2364 StructCopy(data->start_class, &and_with,
2365 struct regnode_charclass_class);
2366 flags &= ~SCF_DO_STCLASS_AND;
2367 StructCopy(&this_class, data->start_class,
2368 struct regnode_charclass_class);
2369 flags |= SCF_DO_STCLASS_OR;
2370 data->start_class->flags |= ANYOF_EOS;
2371 }
2372 } else { /* Non-zero len */
2373 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2374 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2375 cl_and(data->start_class, &and_with);
2376 }
2377 else if (flags & SCF_DO_STCLASS_AND)
2378 cl_and(data->start_class, &this_class);
2379 flags &= ~SCF_DO_STCLASS;
2380 }
c277df42
IZ
2381 if (!scan) /* It was not CURLYX, but CURLY. */
2382 scan = next;
041457d9
DM
2383 if ( /* ? quantifier ok, except for (?{ ... }) */
2384 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2385 && (minnext == 0) && (deltanext == 0)
99799961 2386 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2387 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2388 && ckWARN(WARN_REGEXP))
b45f050a 2389 {
830247a4 2390 vWARN(RExC_parse,
b45f050a
JF
2391 "Quantifier unexpected on zero-length expression");
2392 }
2393
c277df42 2394 min += minnext * mincount;
b81d288d 2395 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2396 && (minnext + deltanext) > 0)
2397 || deltanext == I32_MAX);
aca2d497 2398 is_inf |= is_inf_internal;
c277df42
IZ
2399 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2400
2401 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2402 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2403 && data->flags & SF_IN_PAR
2404 && !(data->flags & SF_HAS_EVAL)
2405 && !deltanext && minnext == 1 ) {
2406 /* Try to optimize to CURLYN. */
2407 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2408 regnode * const nxt1 = nxt;
497b47a8
JH
2409#ifdef DEBUGGING
2410 regnode *nxt2;
2411#endif
c277df42
IZ
2412
2413 /* Skip open. */
2414 nxt = regnext(nxt);
bfed75c6 2415 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2416 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2417 && STR_LEN(nxt) == 1))
c277df42 2418 goto nogo;
497b47a8 2419#ifdef DEBUGGING
c277df42 2420 nxt2 = nxt;
497b47a8 2421#endif
c277df42 2422 nxt = regnext(nxt);
b81d288d 2423 if (OP(nxt) != CLOSE)
c277df42
IZ
2424 goto nogo;
2425 /* Now we know that nxt2 is the only contents: */
eb160463 2426 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2427 OP(oscan) = CURLYN;
2428 OP(nxt1) = NOTHING; /* was OPEN. */
2429#ifdef DEBUGGING
2430 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2431 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2432 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2433 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2434 OP(nxt + 1) = OPTIMIZED; /* was count. */
2435 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2436#endif
c277df42 2437 }
c277df42
IZ
2438 nogo:
2439
2440 /* Try optimization CURLYX => CURLYM. */
b81d288d 2441 if ( OP(oscan) == CURLYX && data
c277df42 2442 && !(data->flags & SF_HAS_PAR)
c277df42 2443 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2444 && !deltanext /* atom is fixed width */
2445 && minnext != 0 /* CURLYM can't handle zero width */
2446 ) {
c277df42
IZ
2447 /* XXXX How to optimize if data == 0? */
2448 /* Optimize to a simpler form. */
2449 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2450 regnode *nxt2;
2451
2452 OP(oscan) = CURLYM;
2453 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2454 && (OP(nxt2) != WHILEM))
c277df42
IZ
2455 nxt = nxt2;
2456 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2457 /* Need to optimize away parenths. */
2458 if (data->flags & SF_IN_PAR) {
2459 /* Set the parenth number. */
2460 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2461
b81d288d 2462 if (OP(nxt) != CLOSE)
b45f050a 2463 FAIL("Panic opt close");
eb160463 2464 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2465 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2466 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2467#ifdef DEBUGGING
2468 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2469 OP(nxt + 1) = OPTIMIZED; /* was count. */
2470 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2471 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2472#endif
c277df42
IZ
2473#if 0
2474 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2475 regnode *nnxt = regnext(nxt1);
b81d288d 2476
c277df42
IZ
2477 if (nnxt == nxt) {
2478 if (reg_off_by_arg[OP(nxt1)])
2479 ARG_SET(nxt1, nxt2 - nxt1);
2480 else if (nxt2 - nxt1 < U16_MAX)
2481 NEXT_OFF(nxt1) = nxt2 - nxt1;
2482 else
2483 OP(nxt) = NOTHING; /* Cannot beautify */
2484 }
2485 nxt1 = nnxt;
2486 }
2487#endif
2488 /* Optimize again: */
b81d288d 2489 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2490 NULL, 0,depth+1);
a0ed51b3
LW
2491 }
2492 else
c277df42 2493 oscan->flags = 0;
c277df42 2494 }
e1901655
IZ
2495 else if ((OP(oscan) == CURLYX)
2496 && (flags & SCF_WHILEM_VISITED_POS)
2497 /* See the comment on a similar expression above.
2498 However, this time it not a subexpression
2499 we care about, but the expression itself. */
2500 && (maxcount == REG_INFTY)
2501 && data && ++data->whilem_c < 16) {
2502 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2503 /* Find WHILEM (as in regexec.c) */
2504 regnode *nxt = oscan + NEXT_OFF(oscan);
2505
2506 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2507 nxt += ARG(nxt);
eb160463
GS
2508 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2509 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2510 }
b81d288d 2511 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2512 pars++;
2513 if (flags & SCF_DO_SUBSTR) {
c445ea15 2514 SV *last_str = NULL;
c277df42
IZ
2515 int counted = mincount != 0;
2516
2517 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2518#if defined(SPARC64_GCC_WORKAROUND)
2519 I32 b = 0;
2520 STRLEN l = 0;
cfd0369c 2521 const char *s = NULL;
5d1c421c
JH
2522 I32 old = 0;
2523
2524 if (pos_before >= data->last_start_min)
2525 b = pos_before;
2526 else
2527 b = data->last_start_min;
2528
2529 l = 0;
cfd0369c 2530 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2531 old = b - data->last_start_min;
2532
2533#else
b81d288d 2534 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2535 ? pos_before : data->last_start_min;
2536 STRLEN l;
d4c19fe8 2537 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2538 I32 old = b - data->last_start_min;
5d1c421c 2539#endif
a0ed51b3
LW
2540
2541 if (UTF)
2542 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2543
a0ed51b3 2544 l -= old;
c277df42 2545 /* Get the added string: */
79cb57f6 2546 last_str = newSVpvn(s + old, l);
0e933229
IH
2547 if (UTF)
2548 SvUTF8_on(last_str);
c277df42
IZ
2549 if (deltanext == 0 && pos_before == b) {
2550 /* What was added is a constant string */
2551 if (mincount > 1) {
2552 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2553 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2554 SvPVX_const(last_str), l, mincount - 1);
b162af07 2555 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2556 /* Add additional parts. */
b81d288d 2557 SvCUR_set(data->last_found,
c277df42
IZ
2558 SvCUR(data->last_found) - l);
2559 sv_catsv(data->last_found, last_str);
0eda9292
JH
2560 {
2561 SV * sv = data->last_found;
2562 MAGIC *mg =
2563 SvUTF8(sv) && SvMAGICAL(sv) ?
2564 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2565 if (mg && mg->mg_len >= 0)
2566 mg->mg_len += CHR_SVLEN(last_str);
2567 }
c277df42
IZ
2568 data->last_end += l * (mincount - 1);
2569 }
2a8d9689
HS
2570 } else {
2571 /* start offset must point into the last copy */
2572 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2573 data->last_start_max += is_inf ? I32_MAX
2574 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2575 }
2576 }
2577 /* It is counted once already... */
2578 data->pos_min += minnext * (mincount - counted);
2579 data->pos_delta += - counted * deltanext +
2580 (minnext + deltanext) * maxcount - minnext * mincount;
2581 if (mincount != maxcount) {
653099ff
GS
2582 /* Cannot extend fixed substrings found inside
2583 the group. */
830247a4 2584 scan_commit(pRExC_state,data);
c277df42 2585 if (mincount && last_str) {
d4c19fe8
AL
2586 SV * const sv = data->last_found;
2587 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2588 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2589
2590 if (mg)
2591 mg->mg_len = -1;
2592 sv_setsv(sv, last_str);
c277df42 2593 data->last_end = data->pos_min;
b81d288d 2594 data->last_start_min =
a0ed51b3 2595 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2596 data->last_start_max = is_inf
2597 ? I32_MAX
c277df42 2598 : data->pos_min + data->pos_delta
a0ed51b3 2599 - CHR_SVLEN(last_str);
c277df42
IZ
2600 }
2601 data->longest = &(data->longest_float);
2602 }
aca2d497 2603 SvREFCNT_dec(last_str);
c277df42 2604 }
405ff068 2605 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2606 data->flags |= SF_HAS_EVAL;
2607 optimize_curly_tail:
c277df42 2608 if (OP(oscan) != CURLYX) {
3dab1dad 2609 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2610 && NEXT_OFF(next))
2611 NEXT_OFF(oscan) += NEXT_OFF(next);
2612 }
c277df42 2613 continue;
653099ff 2614 default: /* REF and CLUMP only? */
c277df42 2615 if (flags & SCF_DO_SUBSTR) {
830247a4 2616 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2617 data->longest = &(data->longest_float);
2618 }
aca2d497 2619 is_inf = is_inf_internal = 1;
653099ff 2620 if (flags & SCF_DO_STCLASS_OR)
830247a4 2621 cl_anything(pRExC_state, data->start_class);
653099ff 2622 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2623 break;
2624 }
a0ed51b3 2625 }
bfed75c6 2626 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2627 int value = 0;
653099ff 2628
c277df42 2629 if (flags & SCF_DO_SUBSTR) {
830247a4 2630 scan_commit(pRExC_state,data);
c277df42
IZ
2631 data->pos_min++;
2632 }
2633 min++;
653099ff
GS
2634 if (flags & SCF_DO_STCLASS) {
2635 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2636
2637 /* Some of the logic below assumes that switching
2638 locale on will only add false positives. */
3dab1dad 2639 switch (PL_regkind[OP(scan)]) {
653099ff 2640 case SANY:
653099ff
GS
2641 default:
2642 do_default:
2643 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2644 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2645 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2646 break;
2647 case REG_ANY:
2648 if (OP(scan) == SANY)
2649 goto do_default;
2650 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2651 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2652 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2653 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2654 }
2655 if (flags & SCF_DO_STCLASS_AND || !value)
2656 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2657 break;
2658 case ANYOF:
2659 if (flags & SCF_DO_STCLASS_AND)
2660 cl_and(data->start_class,
2661 (struct regnode_charclass_class*)scan);
2662 else
830247a4 2663 cl_or(pRExC_state, data->start_class,
653099ff
GS
2664 (struct regnode_charclass_class*)scan);
2665 break;
2666 case ALNUM:
2667 if (flags & SCF_DO_STCLASS_AND) {
2668 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2669 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2670 for (value = 0; value < 256; value++)
2671 if (!isALNUM(value))
2672 ANYOF_BITMAP_CLEAR(data->start_class, value);
2673 }
2674 }
2675 else {
2676 if (data->start_class->flags & ANYOF_LOCALE)
2677 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2678 else {
2679 for (value = 0; value < 256; value++)
2680 if (isALNUM(value))
b81d288d 2681 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2682 }
2683 }
2684 break;
2685 case ALNUML:
2686 if (flags & SCF_DO_STCLASS_AND) {
2687 if (data->start_class->flags & ANYOF_LOCALE)
2688 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2689 }
2690 else {
2691 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2692 data->start_class->flags |= ANYOF_LOCALE;
2693 }
2694 break;
2695 case NALNUM:
2696 if (flags & SCF_DO_STCLASS_AND) {
2697 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2698 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2699 for (value = 0; value < 256; value++)
2700 if (isALNUM(value))
2701 ANYOF_BITMAP_CLEAR(data->start_class, value);
2702 }
2703 }
2704 else {
2705 if (data->start_class->flags & ANYOF_LOCALE)
2706 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2707 else {
2708 for (value = 0; value < 256; value++)
2709 if (!isALNUM(value))
b81d288d 2710 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2711 }
2712 }
2713 break;
2714 case NALNUML:
2715 if (flags & SCF_DO_STCLASS_AND) {
2716 if (data->start_class->flags & ANYOF_LOCALE)
2717 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2718 }
2719 else {
2720 data->start_class->flags |= ANYOF_LOCALE;
2721 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2722 }
2723 break;
2724 case SPACE:
2725 if (flags & SCF_DO_STCLASS_AND) {
2726 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2727 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2728 for (value = 0; value < 256; value++)
2729 if (!isSPACE(value))
2730 ANYOF_BITMAP_CLEAR(data->start_class, value);
2731 }
2732 }
2733 else {
2734 if (data->start_class->flags & ANYOF_LOCALE)
2735 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2736 else {
2737 for (value = 0; value < 256; value++)
2738 if (isSPACE(value))
b81d288d 2739 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2740 }
2741 }
2742 break;
2743 case SPACEL:
2744 if (flags & SCF_DO_STCLASS_AND) {
2745 if (data->start_class->flags & ANYOF_LOCALE)
2746 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2747 }
2748 else {
2749 data->start_class->flags |= ANYOF_LOCALE;
2750 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2751 }
2752 break;
2753 case NSPACE:
2754 if (flags & SCF_DO_STCLASS_AND) {
2755 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2756 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2757 for (value = 0; value < 256; value++)
2758 if (isSPACE(value))
2759 ANYOF_BITMAP_CLEAR(data->start_class, value);
2760 }
2761 }
2762 else {
2763 if (data->start_class->flags & ANYOF_LOCALE)
2764 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2765 else {
2766 for (value = 0; value < 256; value++)
2767 if (!isSPACE(value))
b81d288d 2768 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2769 }
2770 }
2771 break;
2772 case NSPACEL:
2773 if (flags & SCF_DO_STCLASS_AND) {
2774 if (data->start_class->flags & ANYOF_LOCALE) {
2775 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2776 for (value = 0; value < 256; value++)
2777 if (!isSPACE(value))
2778 ANYOF_BITMAP_CLEAR(data->start_class, value);
2779 }
2780 }
2781 else {
2782 data->start_class->flags |= ANYOF_LOCALE;
2783 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2784 }
2785 break;
2786 case DIGIT:
2787 if (flags & SCF_DO_STCLASS_AND) {
2788 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2789 for (value = 0; value < 256; value++)
2790 if (!isDIGIT(value))
2791 ANYOF_BITMAP_CLEAR(data->start_class, value);
2792 }
2793 else {
2794 if (data->start_class->flags & ANYOF_LOCALE)
2795 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2796 else {
2797 for (value = 0; value < 256; value++)
2798 if (isDIGIT(value))
b81d288d 2799 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2800 }
2801 }
2802 break;
2803 case NDIGIT:
2804 if (flags & SCF_DO_STCLASS_AND) {
2805 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2806 for (value = 0; value < 256; value++)
2807 if (isDIGIT(value))
2808 ANYOF_BITMAP_CLEAR(data->start_class, value);
2809 }
2810 else {
2811 if (data->start_class->flags & ANYOF_LOCALE)
2812 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2813 else {
2814 for (value = 0; value < 256; value++)
2815 if (!isDIGIT(value))
b81d288d 2816 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2817 }
2818 }
2819 break;
2820 }
2821 if (flags & SCF_DO_STCLASS_OR)
2822 cl_and(data->start_class, &and_with);
2823 flags &= ~SCF_DO_STCLASS;
2824 }
a0ed51b3 2825 }
3dab1dad 2826 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2827 data->flags |= (OP(scan) == MEOL
2828 ? SF_BEFORE_MEOL
2829 : SF_BEFORE_SEOL);
a0ed51b3 2830 }
3dab1dad 2831 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
2832 /* Lookbehind, or need to calculate parens/evals/stclass: */
2833 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2834 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2835 /* Lookahead/lookbehind */
cb434fcc 2836 I32 deltanext, minnext, fake = 0;
c277df42 2837 regnode *nscan;
653099ff
GS
2838 struct regnode_charclass_class intrnl;
2839 int f = 0;
c277df42
IZ
2840
2841 data_fake.flags = 0;
b81d288d 2842 if (data) {
2c2d71f5 2843 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2844 data_fake.last_closep = data->last_closep;
2845 }
2846 else
2847 data_fake.last_closep = &fake;
653099ff
GS
2848 if ( flags & SCF_DO_STCLASS && !scan->flags
2849 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2850 cl_init(pRExC_state, &intrnl);
653099ff 2851 data_fake.start_class = &intrnl;
e1901655 2852 f |= SCF_DO_STCLASS_AND;
653099ff 2853 }
e1901655
IZ
2854 if (flags & SCF_WHILEM_VISITED_POS)
2855 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2856 next = regnext(scan);
2857 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2858 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2859 if (scan->flags) {
2860 if (deltanext) {
9baa0206 2861 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2862 }
2863 else if (minnext > U8_MAX) {
9baa0206 2864 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2865 }
eb160463 2866 scan->flags = (U8)minnext;
c277df42
IZ
2867 }
2868 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2869 pars++;
405ff068 2870 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2871 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2872 if (data)
2873 data->whilem_c = data_fake.whilem_c;
e1901655 2874 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2875 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2876
2877 cl_and(data->start_class, &intrnl);
2878 if (was)
2879 data->start_class->flags |= ANYOF_EOS;
2880 }
a0ed51b3
LW
2881 }
2882 else if (OP(scan) == OPEN) {
c277df42 2883 pars++;
a0ed51b3 2884 }
cb434fcc 2885 else if (OP(scan) == CLOSE) {
eb160463 2886 if ((I32)ARG(scan) == is_par) {
cb434fcc 2887 next = regnext(scan);
c277df42 2888
cb434fcc
IZ
2889 if ( next && (OP(next) != WHILEM) && next < last)
2890 is_par = 0; /* Disable optimization */
2891 }
2892 if (data)
2893 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2894 }
2895 else if (OP(scan) == EVAL) {
c277df42
IZ
2896 if (data)
2897 data->flags |= SF_HAS_EVAL;
2898 }
96776eda 2899 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2900 if (flags & SCF_DO_SUBSTR) {
830247a4 2901 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2902 data->longest = &(data->longest_float);
2903 }
2904 is_inf = is_inf_internal = 1;
653099ff 2905 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2906 cl_anything(pRExC_state, data->start_class);
96776eda 2907 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2908 }
c277df42
IZ
2909 /* Else: zero-length, ignore. */
2910 scan = regnext(scan);
2911 }
2912
2913 finish:
2914 *scanp = scan;
aca2d497 2915 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2916 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2917 data->pos_delta = I32_MAX - data->pos_min;
2918 if (is_par > U8_MAX)
2919 is_par = 0;
2920 if (is_par && pars==1 && data) {
2921 data->flags |= SF_IN_PAR;
2922 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2923 }
2924 else if (pars && data) {
c277df42
IZ
2925 data->flags |= SF_HAS_PAR;
2926 data->flags &= ~SF_IN_PAR;
2927 }
653099ff
GS
2928 if (flags & SCF_DO_STCLASS_OR)
2929 cl_and(data->start_class, &and_with);
c277df42
IZ
2930 return min;
2931}
2932
76e3520e 2933STATIC I32
5f66b61c 2934S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2935{
830247a4 2936 if (RExC_rx->data) {
b81d288d
AB
2937 Renewc(RExC_rx->data,
2938 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2939 char, struct reg_data);
830247a4
IZ
2940 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2941 RExC_rx->data->count += n;
a0ed51b3
LW
2942 }
2943 else {
a02a5408 2944 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2945 char, struct reg_data);
a02a5408 2946 Newx(RExC_rx->data->what, n, U8);
830247a4 2947 RExC_rx->data->count = n;
c277df42 2948 }
830247a4
IZ
2949 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2950 return RExC_rx->data->count - n;
c277df42
IZ
2951}
2952
76234dfb 2953#ifndef PERL_IN_XSUB_RE
d88dccdf 2954void
864dbfa3 2955Perl_reginitcolors(pTHX)
d88dccdf 2956{
97aff369 2957 dVAR;
1df70142 2958 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2959 if (s) {
1df70142
AL
2960 char *t = savepv(s);
2961 int i = 0;
2962 PL_colors[0] = t;
d88dccdf 2963 while (++i < 6) {
1df70142
AL
2964 t = strchr(t, '\t');
2965 if (t) {
2966 *t = '\0';
2967 PL_colors[i] = ++t;
d88dccdf
IZ
2968 }
2969 else
1df70142 2970 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2971 }
2972 } else {
1df70142 2973 int i = 0;
b81d288d 2974 while (i < 6)
06b5626a 2975 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2976 }
2977 PL_colorset = 1;
2978}
76234dfb 2979#endif
8615cb43 2980
a687059c 2981/*
e50aee73 2982 - pregcomp - compile a regular expression into internal code
a687059c
LW
2983 *
2984 * We can't allocate space until we know how big the compiled form will be,
2985 * but we can't compile it (and thus know how big it is) until we've got a
2986 * place to put the code. So we cheat: we compile it twice, once with code
2987 * generation turned off and size counting turned on, and once "for real".
2988 * This also means that we don't allocate space until we are sure that the
2989 * thing really will compile successfully, and we never have to move the
2990 * code and thus invalidate pointers into it. (Note that it has to be in
2991 * one piece because free() must be able to free it all.) [NB: not true in perl]
2992 *
2993 * Beware that the optimization-preparation code in here knows about some
2994 * of the structure of the compiled regexp. [I'll say.]
2995 */
2996regexp *
864dbfa3 2997Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2998{
97aff369 2999 dVAR;
a0d0e21e 3000 register regexp *r;
c277df42 3001 regnode *scan;
c277df42 3002 regnode *first;
a0d0e21e 3003 I32 flags;
a0d0e21e
LW
3004 I32 minlen = 0;
3005 I32 sawplus = 0;
3006 I32 sawopen = 0;
2c2d71f5 3007 scan_data_t data;
830247a4
IZ
3008 RExC_state_t RExC_state;
3009 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 3010
a3621e74
YO
3011 GET_RE_DEBUG_FLAGS_DECL;
3012
a0d0e21e 3013 if (exp == NULL)
c277df42 3014 FAIL("NULL regexp argument");
a0d0e21e 3015
a5961de5 3016 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3017
5cfc7842 3018 RExC_precomp = exp;
a3621e74
YO
3019 DEBUG_r(if (!PL_colorset) reginitcolors());
3020 DEBUG_COMPILE_r({
3021 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
3022 PL_colors[4],PL_colors[5],PL_colors[0],
3023 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3024 });
e2509266 3025 RExC_flags = pm->op_pmflags;
830247a4 3026 RExC_sawback = 0;
bbce6d69 3027
830247a4
IZ
3028 RExC_seen = 0;
3029 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3030 RExC_seen_evals = 0;
3031 RExC_extralen = 0;
c277df42 3032
bbce6d69 3033 /* First pass: determine size, legality. */
830247a4 3034 RExC_parse = exp;
fac92740 3035 RExC_start = exp;
830247a4
IZ
3036 RExC_end = xend;
3037 RExC_naughty = 0;
3038 RExC_npar = 1;
3039 RExC_size = 0L;
3040 RExC_emit = &PL_regdummy;
3041 RExC_whilem_seen = 0;
85ddcde9
JH
3042#if 0 /* REGC() is (currently) a NOP at the first pass.
3043 * Clever compilers notice this and complain. --jhi */
830247a4 3044 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3045#endif
3dab1dad
YO
3046 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3047 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3048 RExC_precomp = NULL;
a0d0e21e
LW
3049 return(NULL);
3050 }
3dab1dad
YO
3051 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3052 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3053 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
c277df42 3054
c277df42
IZ
3055 /* Small enough for pointer-storage convention?
3056 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3057 if (RExC_size >= 0x10000L && RExC_extralen)
3058 RExC_size += RExC_extralen;
c277df42 3059 else
830247a4
IZ
3060 RExC_extralen = 0;
3061 if (RExC_whilem_seen > 15)
3062 RExC_whilem_seen = 15;
a0d0e21e 3063
bbce6d69 3064 /* Allocate space and initialize. */
a02a5408 3065 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3066 char, regexp);
a0d0e21e 3067 if (r == NULL)
b45f050a
JF
3068 FAIL("Regexp out of space");
3069
0f79a09d
GS
3070#ifdef DEBUGGING
3071 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3072 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3073#endif
c277df42 3074 r->refcnt = 1;
bbce6d69 3075 r->prelen = xend - exp;
5cfc7842 3076 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3077 r->subbeg = NULL;
f8c7b90f 3078#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3079 r->saved_copy = NULL;
ed252734 3080#endif
cf93c79d 3081 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3082 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3083 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3084
3085 r->substrs = 0; /* Useful during FAIL. */
3086 r->startp = 0; /* Useful during FAIL. */
3087 r->endp = 0; /* Useful during FAIL. */
3088
a02a5408 3089 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3090 if (r->offsets) {
2af232bd 3091 r->offsets[0] = RExC_size;
fac92740 3092 }
a3621e74 3093 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3094 "%s %"UVuf" bytes for offset annotations.\n",
3095 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3096 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3097
830247a4 3098 RExC_rx = r;
bbce6d69 3099
3100 /* Second pass: emit code. */
e2509266 3101 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3102 RExC_parse = exp;
3103 RExC_end = xend;
3104 RExC_naughty = 0;
3105 RExC_npar = 1;
fac92740 3106 RExC_emit_start = r->program;
830247a4 3107 RExC_emit = r->program;
2cd61cdb 3108 /* Store the count of eval-groups for security checks: */
eb160463 3109 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 3110 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3111 r->data = 0;
3dab1dad 3112 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e
LW
3113 return(NULL);
3114
a3621e74 3115
a0d0e21e 3116 /* Dig out information for optimizations. */
cf93c79d 3117 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3118 pm->op_pmflags = RExC_flags;
a0ed51b3 3119 if (UTF)
5ff6fc6d 3120 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3121 r->regstclass = NULL;
830247a4 3122 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3123 r->reganch |= ROPT_NAUGHTY;
c277df42 3124 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
3125
3126 /* XXXX To minimize changes to RE engine we always allocate
3127 3-units-long substrs field. */
a02a5408 3128 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 3129
2c2d71f5 3130 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 3131 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3132 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3133 I32 fake;
c5254dd6 3134 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
3135 struct regnode_charclass_class ch_class;
3136 int stclass_flag;
cb434fcc 3137 I32 last_close = 0;
a0d0e21e
LW
3138
3139 first = scan;
c277df42 3140 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3141 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3142 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
3143 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3144 (OP(first) == PLUS) ||
3145 (OP(first) == MINMOD) ||
653099ff 3146 /* An {n,m} with n>0 */
3dab1dad 3147 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
3148 if (OP(first) == PLUS)
3149 sawplus = 1;
3150 else
3dab1dad 3151 first += regarglen[OP(first)];
a0d0e21e 3152 first = NEXTOPER(first);
a687059c
LW
3153 }
3154
a0d0e21e
LW
3155 /* Starting-point info. */
3156 again:
3dab1dad 3157 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3158 if (OP(first) == EXACT)
6f207bd3 3159 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3160 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3161 r->regstclass = first;
3162 }
bfed75c6 3163 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3164 r->regstclass = first;
3dab1dad
YO
3165 else if (PL_regkind[OP(first)] == BOUND ||
3166 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3167 r->regstclass = first;
3dab1dad 3168 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3169 r->reganch |= (OP(first) == MBOL
3170 ? ROPT_ANCH_MBOL
3171 : (OP(first) == SBOL
3172 ? ROPT_ANCH_SBOL
3173 : ROPT_ANCH_BOL));
a0d0e21e 3174 first = NEXTOPER(first);
774d564b 3175 goto again;
3176 }
3177 else if (OP(first) == GPOS) {
3178 r->reganch |= ROPT_ANCH_GPOS;
3179 first = NEXTOPER(first);
3180 goto again;
a0d0e21e 3181 }
e09294f4 3182 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3183 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3184 !(r->reganch & ROPT_ANCH) )
3185 {
3186 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3187 const int type =
3188 (OP(NEXTOPER(first)) == REG_ANY)
3189 ? ROPT_ANCH_MBOL
3190 : ROPT_ANCH_SBOL;
cad2e5aa 3191 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3192 first = NEXTOPER(first);
774d564b 3193 goto again;
a0d0e21e 3194 }
b81d288d 3195 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3196 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3197 /* x+ must match at the 1st pos of run of x's */
3198 r->reganch |= ROPT_SKIP;
a0d0e21e 3199
c277df42 3200 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3201 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3202 (IV)(first - scan + 1)));
a0d0e21e
LW
3203 /*
3204 * If there's something expensive in the r.e., find the
3205 * longest literal string that must appear and make it the
3206 * regmust. Resolve ties in favor of later strings, since
3207 * the regstart check works with the beginning of the r.e.
3208 * and avoiding duplication strengthens checking. Not a
3209 * strong reason, but sufficient in the absence of others.
3210 * [Now we resolve ties in favor of the earlier string if
c277df42 3211 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3212 * earlier string may buy us something the later one won't.]
3213 */
a0d0e21e 3214 minlen = 0;
a687059c 3215
396482e1
GA
3216 data.longest_fixed = newSVpvs("");
3217 data.longest_float = newSVpvs("");
3218 data.last_found = newSVpvs("");
c277df42
IZ
3219 data.longest = &(data.longest_fixed);
3220 first = scan;
653099ff 3221 if (!r->regstclass) {
830247a4 3222 cl_init(pRExC_state, &ch_class);
653099ff
GS
3223 data.start_class = &ch_class;
3224 stclass_flag = SCF_DO_STCLASS_AND;
3225 } else /* XXXX Check for BOUND? */
3226 stclass_flag = 0;
cb434fcc 3227 data.last_closep = &last_close;
653099ff 3228
830247a4 3229 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3230 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3231 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3232 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3233 && !RExC_seen_zerolen
3234 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3235 r->reganch |= ROPT_CHECK_ALL;
830247a4 3236 scan_commit(pRExC_state, &data);
c277df42
IZ
3237 SvREFCNT_dec(data.last_found);
3238
a0ed51b3 3239 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3240 if (longest_float_length
c277df42
IZ
3241 || (data.flags & SF_FL_BEFORE_EOL
3242 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3243 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3244 int t;
3245
a0ed51b3 3246 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3247 && data.offset_fixed == data.offset_float_min
3248 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3249 goto remove_float; /* As in (a)+. */
3250
33b8afdf
JH
3251 if (SvUTF8(data.longest_float)) {
3252 r->float_utf8 = data.longest_float;
c445ea15 3253 r->float_substr = NULL;
33b8afdf
JH
3254 } else {
3255 r->float_substr = data.longest_float;
c445ea15 3256 r->float_utf8 = NULL;
33b8afdf 3257 }
c277df42
IZ
3258 r->float_min_offset = data.offset_float_min;
3259 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3260 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3261 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3262 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3263 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3264 }
3265 else {
aca2d497 3266 remove_float:
c445ea15 3267 r->float_substr = r->float_utf8 = NULL;
c277df42 3268 SvREFCNT_dec(data.longest_float);
c5254dd6 3269 longest_float_length = 0;
a0d0e21e 3270 }
c277df42 3271
a0ed51b3 3272 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3273 if (longest_fixed_length
c277df42
IZ
3274 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3275 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3276 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3277 int t;
3278
33b8afdf
JH
3279 if (SvUTF8(data.longest_fixed)) {
3280 r->anchored_utf8 = data.longest_fixed;
c445ea15 3281 r->anchored_substr = NULL;
33b8afdf
JH
3282 } else {
3283 r->anchored_substr = data.longest_fixed;
c445ea15 3284 r->anchored_utf8 = NULL;
33b8afdf 3285 }
c277df42 3286 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3287 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3288 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3289 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3290 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3291 }
3292 else {
c445ea15 3293 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3294 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3295 longest_fixed_length = 0;
a0d0e21e 3296 }
b81d288d 3297 if (r->regstclass
ffc61ed2 3298 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3299 r->regstclass = NULL;
33b8afdf
JH
3300 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3301 && stclass_flag
653099ff 3302 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3303 && !cl_is_anything(data.start_class))
3304 {
1df70142 3305 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3306
a02a5408 3307 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3308 struct regnode_charclass_class);
3309 StructCopy(data.start_class,
830247a4 3310 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3311 struct regnode_charclass_class);
830247a4 3312 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3313 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3314 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3315 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3316 PerlIO_printf(Perl_debug_log,
a0288114 3317 "synthetic stclass \"%s\".\n",
3f7c398e 3318 SvPVX_const(sv));});
653099ff 3319 }
c277df42
IZ
3320
3321 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3322 if (longest_fixed_length > longest_float_length) {
c277df42 3323 r->check_substr = r->anchored_substr;
33b8afdf 3324 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3325 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3326 if (r->reganch & ROPT_ANCH_SINGLE)
3327 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3328 }
3329 else {
c277df42 3330 r->check_substr = r->float_substr;
33b8afdf 3331 r->check_utf8 = r->float_utf8;
c277df42
IZ
3332 r->check_offset_min = data.offset_float_min;
3333 r->check_offset_max = data.offset_float_max;
a0d0e21e 3334 }
30382c73
IZ
3335 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3336 This should be changed ASAP! */
33b8afdf 3337 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3338 r->reganch |= RE_USE_INTUIT;
33b8afdf 3339 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3340 r->reganch |= RE_INTUIT_TAIL;
3341 }
a0ed51b3
LW
3342 }
3343 else {
c277df42
IZ
3344 /* Several toplevels. Best we can is to set minlen. */
3345 I32 fake;
653099ff 3346 struct regnode_charclass_class ch_class;
cb434fcc 3347 I32 last_close = 0;
c277df42 3348
a3621e74 3349 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3350 scan = r->program + 1;
830247a4 3351 cl_init(pRExC_state, &ch_class);
653099ff 3352 data.start_class = &ch_class;
cb434fcc 3353 data.last_closep = &last_close;
a3621e74 3354 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3355 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3356 = r->float_substr = r->float_utf8 = NULL;
653099ff 3357 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3358 && !cl_is_anything(data.start_class))
3359 {
1df70142 3360 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3361
a02a5408 3362 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3363 struct regnode_charclass_class);
3364 StructCopy(data.start_class,
830247a4 3365 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3366 struct regnode_charclass_class);
830247a4 3367 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3368 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3369 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3370 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3371 PerlIO_printf(Perl_debug_log,
a0288114 3372 "synthetic stclass \"%s\".\n",
3f7c398e 3373 SvPVX_const(sv));});
653099ff 3374 }
a0d0e21e
LW
3375 }
3376
a0d0e21e 3377 r->minlen = minlen;
b81d288d 3378 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3379 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3380 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3381 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3382 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3383 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3384 if (RExC_seen & REG_SEEN_CANY)
3385 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3386 Newxz(r->startp, RExC_npar, I32);
3387 Newxz(r->endp, RExC_npar, I32);
3dab1dad
YO
3388 DEBUG_COMPILE_r({
3389 if (SvIV(re_debug_flags)> (RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE))
3390 PerlIO_printf(Perl_debug_log,"Final program:\n");
3391 regdump(r);
3392 });
a0d0e21e 3393 return(r);
a687059c
LW
3394}
3395
3dab1dad
YO
3396
3397#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3398 int rem=(int)(RExC_end - RExC_parse); \
3399 int cut; \
3400 int num; \
3401 int iscut=0; \
3402 if (rem>10) { \
3403 rem=10; \
3404 iscut=1; \
3405 } \
3406 cut=10-rem; \
3407 if (RExC_lastparse!=RExC_parse) \
3408 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3409 rem, RExC_parse, \
3410 cut + 4, \
3411 iscut ? "..." : "<" \
3412 ); \
3413 else \
3414 PerlIO_printf(Perl_debug_log,"%16s",""); \
3415 \
3416 if (SIZE_ONLY) \
3417 num=RExC_size; \
3418 else \
3419 num=REG_NODE_NUM(RExC_emit); \
3420 if (RExC_lastnum!=num) \
3421 PerlIO_printf(Perl_debug_log,"%4d",num); \
3422 else \
3423 PerlIO_printf(Perl_debug_log,"%4s",""); \
3424 PerlIO_printf(Perl_debug_log,"%*s%-4s", \
3425 10+(depth*2),"", \
3426 (funcname) \
3427 ); \
3428 RExC_lastnum=num; \
3429 RExC_lastparse=RExC_parse; \
3430})
3431
3432#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3433 DEBUG_PARSE_MSG((funcname)); \
3434 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3435})
a687059c
LW
3436/*
3437 - reg - regular expression, i.e. main body or parenthesized thing
3438 *
3439 * Caller must absorb opening parenthesis.
3440 *
3441 * Combining parenthesis handling with the base level of regular expression
3442 * is a trifle forced, but the need to tie the tails of the branches to what
3443 * follows makes it hard to avoid.
3444 */
3dab1dad
YO
3445#define REGTAIL(x,y,z) regtail(x,y,z,depth+1)
3446
76e3520e 3447STATIC regnode *
3dab1dad 3448S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 3449 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3450{
27da23d5 3451 dVAR;
c277df42
IZ
3452 register regnode *ret; /* Will be the head of the group. */
3453 register regnode *br;
3454 register regnode *lastbr;
cbbf8932 3455 register regnode *ender = NULL;
a0d0e21e 3456 register I32 parno = 0;
cbbf8932
AL
3457 I32 flags;
3458 const I32 oregflags = RExC_flags;
6136c704
AL
3459 bool have_branch = 0;
3460 bool is_open = 0;
9d1d55b5
JP
3461
3462 /* for (?g), (?gc), and (?o) warnings; warning
3463 about (?c) will warn about (?g) -- japhy */
3464
6136c704
AL
3465#define WASTED_O 0x01
3466#define WASTED_G 0x02
3467#define WASTED_C 0x04
3468#define WASTED_GC (0x02|0x04)
cbbf8932 3469 I32 wastedflags = 0x00;
9d1d55b5 3470
fac92740 3471 char * parse_start = RExC_parse; /* MJD */
a28509cc 3472 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3473
3dab1dad
YO
3474 GET_RE_DEBUG_FLAGS_DECL;
3475 DEBUG_PARSE("reg ");
3476
3477
821b33a5 3478 *flagp = 0; /* Tentatively. */
a0d0e21e 3479
9d1d55b5 3480
a0d0e21e
LW
3481 /* Make an OPEN node, if parenthesized. */
3482 if (paren) {
fac92740 3483 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3484 U32 posflags = 0, negflags = 0;
3485 U32 *flagsp = &posflags;
6136c704 3486 bool is_logical = 0;
a28509cc 3487 const char * const seqstart = RExC_parse;
ca9dfc88 3488
830247a4
IZ
3489 RExC_parse++;
3490 paren = *RExC_parse++;
c277df42 3491 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3492 switch (paren) {
fac92740 3493 case '<': /* (?<...) */
830247a4 3494 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3495 if (*RExC_parse == '!')
c277df42 3496 paren = ',';
b81d288d 3497 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3498 goto unknown;
830247a4 3499 RExC_parse++;
fac92740
MJD
3500 case '=': /* (?=...) */
3501 case '!': /* (?!...) */
830247a4 3502 RExC_seen_zerolen++;
fac92740
MJD
3503 case ':': /* (?:...) */
3504 case '>': /* (?>...) */
a0d0e21e 3505 break;
fac92740
MJD
3506 case '$': /* (?$...) */
3507 case '@': /* (?@...) */
8615cb43 3508 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3509 break;
fac92740 3510 case '#': /* (?#...) */
830247a4
IZ
3511 while (*RExC_parse && *RExC_parse != ')')
3512 RExC_parse++;
3513 if (*RExC_parse != ')')
c277df42 3514 FAIL("Sequence (?#... not terminated");
830247a4 3515 nextchar(pRExC_state);
a0d0e21e
LW
3516 *flagp = TRYAGAIN;
3517 return NULL;
fac92740 3518 case 'p': /* (?p...) */
9014280d 3519 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3520 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3521 /* FALL THROUGH*/
fac92740 3522 case '?': /* (??...) */
6136c704 3523 is_logical = 1;
438a3801
YST
3524 if (*RExC_parse != '{')
3525 goto unknown;
830247a4 3526 paren = *RExC_parse++;
0f5d15d6 3527 /* FALL THROUGH */
fac92740 3528 case '{': /* (?{...}) */
c277df42 3529 {
c277df42
IZ
3530 I32 count = 1, n = 0;
3531 char c;
830247a4 3532 char *s = RExC_parse;
c277df42 3533
830247a4
IZ
3534 RExC_seen_zerolen++;
3535 RExC_seen |= REG_SEEN_EVAL;
3536 while (count && (c = *RExC_parse)) {
6136c704
AL
3537 if (c == '\\') {
3538 if (RExC_parse[1])
3539 RExC_parse++;
3540 }
b81d288d 3541 else if (c == '{')
c277df42 3542 count++;
b81d288d 3543 else if (c == '}')
c277df42 3544 count--;
830247a4 3545 RExC_parse++;
c277df42 3546 }
6136c704 3547 if (*RExC_parse != ')') {
b81d288d 3548 RExC_parse = s;
b45f050a
JF
3549 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3550 }
c277df42 3551 if (!SIZE_ONLY) {
f3548bdc 3552 PAD *pad;
6136c704
AL
3553 OP_4tree *sop, *rop;
3554 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3555
569233ed
SB
3556 ENTER;
3557 Perl_save_re_context(aTHX);
f3548bdc 3558 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3559 sop->op_private |= OPpREFCOUNTED;
3560 /* re_dup will OpREFCNT_inc */
3561 OpREFCNT_set(sop, 1);
569233ed 3562 LEAVE;
c277df42 3563
830247a4
IZ
3564 n = add_data(pRExC_state, 3, "nop");
3565 RExC_rx->data->data[n] = (void*)rop;
3566 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3567 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3568 SvREFCNT_dec(sv);
a0ed51b3 3569 }
e24b16f9 3570 else { /* First pass */
830247a4 3571 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3572 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3573 /* No compiled RE interpolated, has runtime
3574 components ===> unsafe. */
3575 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3576 if (PL_tainting && PL_tainted)
cc6b7395 3577 FAIL("Eval-group in insecure regular expression");
54df2634 3578#if PERL_VERSION > 8
923e4eb5 3579 if (IN_PERL_COMPILETIME)
b5c19bd7 3580 PL_cv_has_eval = 1;
54df2634 3581#endif
c277df42 3582 }
b5c19bd7 3583
830247a4 3584 nextchar(pRExC_state);
6136c704 3585 if (is_logical) {
830247a4 3586 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3587 if (!SIZE_ONLY)
3588 ret->flags = 2;
3dab1dad 3589 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3590 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3591 return ret;
3592 }
ccb2c380
MP
3593 ret = reganode(pRExC_state, EVAL, n);
3594 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3595 Set_Node_Offset(ret, parse_start);
3596 return ret;
c277df42 3597 }
fac92740 3598 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3599 {
fac92740 3600 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3601 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3602 || RExC_parse[1] == '<'
830247a4 3603 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3604 I32 flag;
3605
830247a4 3606 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3607 if (!SIZE_ONLY)
3608 ret->flags = 1;
3dab1dad 3609 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 3610 goto insert_if;
b81d288d 3611 }
a0ed51b3