This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
skip t/op/incfilter.t for "make clean;make miniperl;make minitest"
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
125} RExC_state_t;
126
e2509266 127#define RExC_flags (pRExC_state->flags)
830247a4
IZ
128#define RExC_precomp (pRExC_state->precomp)
129#define RExC_rx (pRExC_state->rx)
fac92740 130#define RExC_start (pRExC_state->start)
830247a4
IZ
131#define RExC_end (pRExC_state->end)
132#define RExC_parse (pRExC_state->parse)
133#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 134#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 135#define RExC_emit (pRExC_state->emit)
fac92740 136#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
137#define RExC_naughty (pRExC_state->naughty)
138#define RExC_sawback (pRExC_state->sawback)
139#define RExC_seen (pRExC_state->seen)
140#define RExC_size (pRExC_state->size)
141#define RExC_npar (pRExC_state->npar)
142#define RExC_extralen (pRExC_state->extralen)
143#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
144#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 145#define RExC_utf8 (pRExC_state->utf8)
830247a4 146
a687059c
LW
147#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
148#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
149 ((*s) == '{' && regcurly(s)))
a687059c 150
35c8bce7
LW
151#ifdef SPSTART
152#undef SPSTART /* dratted cpp namespace... */
153#endif
a687059c
LW
154/*
155 * Flags to be passed up and down.
156 */
a687059c 157#define WORST 0 /* Worst case. */
821b33a5 158#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
159#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
160#define SPSTART 0x4 /* Starts with * or +. */
161#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 162
2c2d71f5
JH
163/* Length of a variant. */
164
165typedef struct scan_data_t {
166 I32 len_min;
167 I32 len_delta;
168 I32 pos_min;
169 I32 pos_delta;
170 SV *last_found;
171 I32 last_end; /* min value, <0 unless valid. */
172 I32 last_start_min;
173 I32 last_start_max;
174 SV **longest; /* Either &l_fixed, or &l_float. */
175 SV *longest_fixed;
176 I32 offset_fixed;
177 SV *longest_float;
178 I32 offset_float_min;
179 I32 offset_float_max;
180 I32 flags;
181 I32 whilem_c;
cb434fcc 182 I32 *last_closep;
653099ff 183 struct regnode_charclass_class *start_class;
2c2d71f5
JH
184} scan_data_t;
185
a687059c 186/*
e50aee73 187 * Forward declarations for pregcomp()'s friends.
a687059c 188 */
a0d0e21e 189
27da23d5
JH
190static const scan_data_t zero_scan_data =
191 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
192
193#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
194#define SF_BEFORE_SEOL 0x1
195#define SF_BEFORE_MEOL 0x2
196#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
197#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
198
09b7f37c
CB
199#ifdef NO_UNARY_PLUS
200# define SF_FIX_SHIFT_EOL (0+2)
201# define SF_FL_SHIFT_EOL (0+4)
202#else
203# define SF_FIX_SHIFT_EOL (+2)
204# define SF_FL_SHIFT_EOL (+4)
205#endif
c277df42
IZ
206
207#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
208#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
209
210#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
211#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
212#define SF_IS_INF 0x40
213#define SF_HAS_PAR 0x80
214#define SF_IN_PAR 0x100
215#define SF_HAS_EVAL 0x200
4bfe0158 216#define SCF_DO_SUBSTR 0x400
653099ff
GS
217#define SCF_DO_STCLASS_AND 0x0800
218#define SCF_DO_STCLASS_OR 0x1000
219#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 220#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 221
eb160463 222#define UTF (RExC_utf8 != 0)
e2509266
JH
223#define LOC ((RExC_flags & PMf_LOCALE) != 0)
224#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 225
ffc61ed2 226#define OOB_UNICODE 12345678
93733859 227#define OOB_NAMEDCLASS -1
b8c5462f 228
a0ed51b3
LW
229#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
230#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
231
8615cb43 232
b45f050a
JF
233/* length of regex to show in messages that don't mark a position within */
234#define RegexLengthToShowInErrorMessages 127
235
236/*
237 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
238 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
239 * op/pragma/warn/regcomp.
240 */
7253e4e3
RK
241#define MARKER1 "<-- HERE" /* marker as it appears in the description */
242#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 243
7253e4e3 244#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
245
246/*
247 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
248 * arg. Show regex, up to a maximum length. If it's too long, chop and add
249 * "...".
250 */
ccb2c380 251#define FAIL(msg) STMT_START { \
bfed75c6 252 const char *ellipses = ""; \
ccb2c380
MP
253 IV len = RExC_end - RExC_precomp; \
254 \
255 if (!SIZE_ONLY) \
256 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
257 if (len > RegexLengthToShowInErrorMessages) { \
258 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
259 len = RegexLengthToShowInErrorMessages - 10; \
260 ellipses = "..."; \
261 } \
262 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
263 msg, (int)len, RExC_precomp, ellipses); \
264} STMT_END
8615cb43 265
b45f050a 266/*
b45f050a
JF
267 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
268 */
ccb2c380 269#define Simple_vFAIL(m) STMT_START { \
a28509cc 270 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
271 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
272 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
273} STMT_END
b45f050a
JF
274
275/*
276 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
277 */
ccb2c380
MP
278#define vFAIL(m) STMT_START { \
279 if (!SIZE_ONLY) \
280 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
281 Simple_vFAIL(m); \
282} STMT_END
b45f050a
JF
283
284/*
285 * Like Simple_vFAIL(), but accepts two arguments.
286 */
ccb2c380 287#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 288 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
289 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
290 (int)offset, RExC_precomp, RExC_precomp + offset); \
291} STMT_END
b45f050a
JF
292
293/*
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
295 */
ccb2c380
MP
296#define vFAIL2(m,a1) STMT_START { \
297 if (!SIZE_ONLY) \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
299 Simple_vFAIL2(m, a1); \
300} STMT_END
b45f050a
JF
301
302
303/*
304 * Like Simple_vFAIL(), but accepts three arguments.
305 */
ccb2c380 306#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 307 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
308 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
309 (int)offset, RExC_precomp, RExC_precomp + offset); \
310} STMT_END
b45f050a
JF
311
312/*
313 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
314 */
ccb2c380
MP
315#define vFAIL3(m,a1,a2) STMT_START { \
316 if (!SIZE_ONLY) \
317 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
318 Simple_vFAIL3(m, a1, a2); \
319} STMT_END
b45f050a
JF
320
321/*
322 * Like Simple_vFAIL(), but accepts four arguments.
323 */
ccb2c380 324#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 325 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
328} STMT_END
b45f050a 329
ccb2c380 330#define vWARN(loc,m) STMT_START { \
a28509cc 331 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
332 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
333 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
334} STMT_END
335
336#define vWARNdep(loc,m) STMT_START { \
a28509cc 337 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
338 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
339 "%s" REPORT_LOCATION, \
340 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
341} STMT_END
342
343
344#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 345 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
346 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
347 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
348} STMT_END
349
350#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 351 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
352 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
353 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
354} STMT_END
355
356#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 357 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
358 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
359 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
360} STMT_END
361
362#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 363 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
366} STMT_END
9d1d55b5 367
8615cb43 368
cd439c50 369/* Allow for side effects in s */
ccb2c380
MP
370#define REGC(c,s) STMT_START { \
371 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
372} STMT_END
cd439c50 373
fac92740
MJD
374/* Macros for recording node offsets. 20001227 mjd@plover.com
375 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
376 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
377 * Element 0 holds the number n.
378 */
379
380#define MJD_OFFSET_DEBUG(x)
a3621e74 381/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
382
383
384#define Set_Node_Offset_To_R(node,byte) STMT_START { \
385 if (! SIZE_ONLY) { \
386 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
387 __LINE__, (node), (byte))); \
388 if((node) < 0) { \
551405c4 389 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
390 } else { \
391 RExC_offsets[2*(node)-1] = (byte); \
392 } \
393 } \
394} STMT_END
395
396#define Set_Node_Offset(node,byte) \
397 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
398#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
399
400#define Set_Node_Length_To_R(node,len) STMT_START { \
401 if (! SIZE_ONLY) { \
402 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 403 __LINE__, (int)(node), (int)(len))); \
ccb2c380 404 if((node) < 0) { \
551405c4 405 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
406 } else { \
407 RExC_offsets[2*(node)] = (len); \
408 } \
409 } \
410} STMT_END
411
412#define Set_Node_Length(node,len) \
413 Set_Node_Length_To_R((node)-RExC_emit_start, len)
414#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
415#define Set_Node_Cur_Length(node) \
416 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
417
418/* Get offsets and lengths */
419#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
420#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
421
acfe0abc 422static void clear_re(pTHX_ void *r);
4327152a 423
653099ff
GS
424/* Mark that we cannot extend a found fixed substring at this point.
425 Updata the longest found anchored substring and the longest found
426 floating substrings if needed. */
427
4327152a 428STATIC void
097eb12c 429S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 430{
e1ec3a88
AL
431 const STRLEN l = CHR_SVLEN(data->last_found);
432 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 433
c277df42 434 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 435 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
436 if (*data->longest == data->longest_fixed) {
437 data->offset_fixed = l ? data->last_start_min : data->pos_min;
438 if (data->flags & SF_BEFORE_EOL)
b81d288d 439 data->flags
c277df42
IZ
440 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
441 else
442 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
443 }
444 else {
c277df42 445 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
446 data->offset_float_max = (l
447 ? data->last_start_max
c277df42 448 : data->pos_min + data->pos_delta);
9051bda5
HS
449 if ((U32)data->offset_float_max > (U32)I32_MAX)
450 data->offset_float_max = I32_MAX;
c277df42 451 if (data->flags & SF_BEFORE_EOL)
b81d288d 452 data->flags
c277df42
IZ
453 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
454 else
455 data->flags &= ~SF_FL_BEFORE_EOL;
456 }
457 }
458 SvCUR_set(data->last_found, 0);
0eda9292 459 {
a28509cc 460 SV * const sv = data->last_found;
097eb12c
AL
461 if (SvUTF8(sv) && SvMAGICAL(sv)) {
462 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
463 if (mg)
464 mg->mg_len = 0;
465 }
0eda9292 466 }
c277df42
IZ
467 data->last_end = -1;
468 data->flags &= ~SF_BEFORE_EOL;
469}
470
653099ff
GS
471/* Can match anything (initialization) */
472STATIC void
097eb12c 473S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 474{
653099ff 475 ANYOF_CLASS_ZERO(cl);
f8bef550 476 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 477 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
478 if (LOC)
479 cl->flags |= ANYOF_LOCALE;
480}
481
482/* Can match anything (initialization) */
483STATIC int
5f66b61c 484S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
485{
486 int value;
487
aaa51d5e 488 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
489 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
490 return 1;
1aa99e6b
IH
491 if (!(cl->flags & ANYOF_UNICODE_ALL))
492 return 0;
f8bef550
NC
493 if (!ANYOF_BITMAP_TESTALLSET(cl))
494 return 0;
653099ff
GS
495 return 1;
496}
497
498/* Can match anything (initialization) */
499STATIC void
097eb12c 500S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 501{
8ecf7187 502 Zero(cl, 1, struct regnode_charclass_class);
653099ff 503 cl->type = ANYOF;
830247a4 504 cl_anything(pRExC_state, cl);
653099ff
GS
505}
506
507STATIC void
097eb12c 508S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 509{
8ecf7187 510 Zero(cl, 1, struct regnode_charclass_class);
653099ff 511 cl->type = ANYOF;
830247a4 512 cl_anything(pRExC_state, cl);
653099ff
GS
513 if (LOC)
514 cl->flags |= ANYOF_LOCALE;
515}
516
517/* 'And' a given class with another one. Can create false positives */
518/* We assume that cl is not inverted */
519STATIC void
5f66b61c 520S_cl_and(struct regnode_charclass_class *cl,
a28509cc 521 const struct regnode_charclass_class *and_with)
653099ff 522{
653099ff
GS
523 if (!(and_with->flags & ANYOF_CLASS)
524 && !(cl->flags & ANYOF_CLASS)
525 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
526 && !(and_with->flags & ANYOF_FOLD)
527 && !(cl->flags & ANYOF_FOLD)) {
528 int i;
529
530 if (and_with->flags & ANYOF_INVERT)
531 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
532 cl->bitmap[i] &= ~and_with->bitmap[i];
533 else
534 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
535 cl->bitmap[i] &= and_with->bitmap[i];
536 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
537 if (!(and_with->flags & ANYOF_EOS))
538 cl->flags &= ~ANYOF_EOS;
1aa99e6b 539
14ebb1a2
JH
540 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
541 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
542 cl->flags &= ~ANYOF_UNICODE_ALL;
543 cl->flags |= ANYOF_UNICODE;
544 ARG_SET(cl, ARG(and_with));
545 }
14ebb1a2
JH
546 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
547 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 548 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
549 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
550 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 551 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
552}
553
554/* 'OR' a given class with another one. Can create false positives */
555/* We assume that cl is not inverted */
556STATIC void
097eb12c 557S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 558{
653099ff
GS
559 if (or_with->flags & ANYOF_INVERT) {
560 /* We do not use
561 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
562 * <= (B1 | !B2) | (CL1 | !CL2)
563 * which is wasteful if CL2 is small, but we ignore CL2:
564 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
565 * XXXX Can we handle case-fold? Unclear:
566 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
567 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
568 */
569 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
570 && !(or_with->flags & ANYOF_FOLD)
571 && !(cl->flags & ANYOF_FOLD) ) {
572 int i;
573
574 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
575 cl->bitmap[i] |= ~or_with->bitmap[i];
576 } /* XXXX: logic is complicated otherwise */
577 else {
830247a4 578 cl_anything(pRExC_state, cl);
653099ff
GS
579 }
580 } else {
581 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
582 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 583 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
584 || (cl->flags & ANYOF_FOLD)) ) {
585 int i;
586
587 /* OR char bitmap and class bitmap separately */
588 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
589 cl->bitmap[i] |= or_with->bitmap[i];
590 if (or_with->flags & ANYOF_CLASS) {
591 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
592 cl->classflags[i] |= or_with->classflags[i];
593 cl->flags |= ANYOF_CLASS;
594 }
595 }
596 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 597 cl_anything(pRExC_state, cl);
653099ff
GS
598 }
599 }
600 if (or_with->flags & ANYOF_EOS)
601 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
602
603 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
604 ARG(cl) != ARG(or_with)) {
605 cl->flags |= ANYOF_UNICODE_ALL;
606 cl->flags &= ~ANYOF_UNICODE;
607 }
608 if (or_with->flags & ANYOF_UNICODE_ALL) {
609 cl->flags |= ANYOF_UNICODE_ALL;
610 cl->flags &= ~ANYOF_UNICODE;
611 }
653099ff
GS
612}
613
5d1c421c 614/*
a3621e74
YO
615
616 make_trie(startbranch,first,last,tail,flags)
617 startbranch: the first branch in the whole branch sequence
618 first : start branch of sequence of branch-exact nodes.
619 May be the same as startbranch
620 last : Thing following the last branch.
621 May be the same as tail.
622 tail : item following the branch sequence
623 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
624
625Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
626
627A trie is an N'ary tree where the branches are determined by digital
628decomposition of the key. IE, at the root node you look up the 1st character and
629follow that branch repeat until you find the end of the branches. Nodes can be
630marked as "accepting" meaning they represent a complete word. Eg:
631
632 /he|she|his|hers/
633
634would convert into the following structure. Numbers represent states, letters
635following numbers represent valid transitions on the letter from that state, if
636the number is in square brackets it represents an accepting state, otherwise it
637will be in parenthesis.
638
639 +-h->+-e->[3]-+-r->(8)-+-s->[9]
640 | |
641 | (2)
642 | |
643 (1) +-i->(6)-+-s->[7]
644 |
645 +-s->(3)-+-h->(4)-+-e->[5]
646
647 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
648
649This shows that when matching against the string 'hers' we will begin at state 1
650read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
651then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
652is also accepting. Thus we know that we can match both 'he' and 'hers' with a
653single traverse. We store a mapping from accepting to state to which word was
654matched, and then when we have multiple possibilities we try to complete the
655rest of the regex in the order in which they occured in the alternation.
656
657The only prior NFA like behaviour that would be changed by the TRIE support is
658the silent ignoring of duplicate alternations which are of the form:
659
660 / (DUPE|DUPE) X? (?{ ... }) Y /x
661
662Thus EVAL blocks follwing a trie may be called a different number of times with
663and without the optimisation. With the optimisations dupes will be silently
664ignored. This inconsistant behaviour of EVAL type nodes is well established as
665the following demonstrates:
666
667 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
668
669which prints out 'word' three times, but
670
671 'words'=~/(word|word|word)(?{ print $1 })S/
672
673which doesnt print it out at all. This is due to other optimisations kicking in.
674
675Example of what happens on a structural level:
676
677The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
678
679 1: CURLYM[1] {1,32767}(18)
680 5: BRANCH(8)
681 6: EXACT <ac>(16)
682 8: BRANCH(11)
683 9: EXACT <ad>(16)
684 11: BRANCH(14)
685 12: EXACT <ab>(16)
686 16: SUCCEED(0)
687 17: NOTHING(18)
688 18: END(0)
689
690This would be optimizable with startbranch=5, first=5, last=16, tail=16
691and should turn into:
692
693 1: CURLYM[1] {1,32767}(18)
694 5: TRIE(16)
695 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
696 <ac>
697 <ad>
698 <ab>
699 16: SUCCEED(0)
700 17: NOTHING(18)
701 18: END(0)
702
703Cases where tail != last would be like /(?foo|bar)baz/:
704
705 1: BRANCH(4)
706 2: EXACT <foo>(8)
707 4: BRANCH(7)
708 5: EXACT <bar>(8)
709 7: TAIL(8)
710 8: EXACT <baz>(10)
711 10: END(0)
712
713which would be optimizable with startbranch=1, first=1, last=7, tail=8
714and would end up looking like:
715
716 1: TRIE(8)
717 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
718 <foo>
719 <bar>
720 7: TAIL(8)
721 8: EXACT <baz>(10)
722 10: END(0)
723
724*/
725
726#define TRIE_DEBUG_CHAR \
727 DEBUG_TRIE_COMPILE_r({ \
728 SV *tmp; \
729 if ( UTF ) { \
6136c704 730 tmp = newSVpvs( "" ); \
a3621e74
YO
731 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
732 } else { \
e4584336 733 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
734 } \
735 av_push( trie->revcharmap, tmp ); \
736 })
737
738#define TRIE_READ_CHAR STMT_START { \
739 if ( UTF ) { \
740 if ( folder ) { \
741 if ( foldlen > 0 ) { \
742 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
743 foldlen -= len; \
744 scan += len; \
745 len = 0; \
746 } else { \
e1ec3a88 747 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
748 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
749 foldlen -= UNISKIP( uvc ); \
750 scan = foldbuf + UNISKIP( uvc ); \
751 } \
752 } else { \
e1ec3a88 753 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
754 } \
755 } else { \
756 uvc = (U32)*uc; \
757 len = 1; \
758 } \
759} STMT_END
760
761
762#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
763#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
764#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
765#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
766
767#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
768 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
769 TRIE_LIST_LEN( state ) *= 2; \
770 Renew( trie->states[ state ].trans.list, \
771 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
772 } \
773 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
774 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
775 TRIE_LIST_CUR( state )++; \
776} STMT_END
777
778#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 779 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
780 4, reg_trie_trans_le ); \
781 TRIE_LIST_CUR( state ) = 1; \
782 TRIE_LIST_LEN( state ) = 4; \
783} STMT_END
784
785STATIC I32
786S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
787{
27da23d5 788 dVAR;
a3621e74
YO
789 /* first pass, loop through and scan words */
790 reg_trie_data *trie;
791 regnode *cur;
9f7f3913 792 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
793 STRLEN len = 0;
794 UV uvc = 0;
795 U16 curword = 0;
796 U32 next_alloc = 0;
797 /* we just use folder as a flag in utf8 */
e1ec3a88 798 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
799 ? PL_fold
800 : ( flags == EXACTFL
801 ? PL_fold_locale
802 : NULL
803 )
804 );
805
e1ec3a88 806 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
807 SV *re_trie_maxbuff;
808
809 GET_RE_DEBUG_FLAGS_DECL;
810
a02a5408 811 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
812 trie->refcount = 1;
813 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 814 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
815 DEBUG_r({
816 trie->words = newAV();
817 trie->revcharmap = newAV();
818 });
819
820
0111c4fd 821 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 822 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 823 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
824 }
825
826 /* -- First loop and Setup --
827
828 We first traverse the branches and scan each word to determine if it
829 contains widechars, and how many unique chars there are, this is
830 important as we have to build a table with at least as many columns as we
831 have unique chars.
832
833 We use an array of integers to represent the character codes 0..255
834 (trie->charmap) and we use a an HV* to store unicode characters. We use the
835 native representation of the character value as the key and IV's for the
836 coded index.
837
838 *TODO* If we keep track of how many times each character is used we can
839 remap the columns so that the table compression later on is more
840 efficient in terms of memory by ensuring most common value is in the
841 middle and the least common are on the outside. IMO this would be better
842 than a most to least common mapping as theres a decent chance the most
843 common letter will share a node with the least common, meaning the node
844 will not be compressable. With a middle is most common approach the worst
845 case is when we have the least common nodes twice.
846
847 */
848
849
850 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 851 regnode * const noper = NEXTOPER( cur );
e1ec3a88 852 const U8 *uc = (U8*)STRING( noper );
a28509cc 853 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
854 STRLEN foldlen = 0;
855 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 856 const U8 *scan = (U8*)NULL;
a3621e74
YO
857
858 for ( ; uc < e ; uc += len ) {
859 trie->charcount++;
860 TRIE_READ_CHAR;
861 if ( uvc < 256 ) {
862 if ( !trie->charmap[ uvc ] ) {
863 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
864 if ( folder )
865 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
866 TRIE_DEBUG_CHAR;
867 }
868 } else {
869 SV** svpp;
870 if ( !trie->widecharmap )
871 trie->widecharmap = newHV();
872
873 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
874
875 if ( !svpp )
e4584336 876 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
877
878 if ( !SvTRUE( *svpp ) ) {
879 sv_setiv( *svpp, ++trie->uniquecharcount );
880 TRIE_DEBUG_CHAR;
881 }
882 }
883 }
884 trie->wordcount++;
885 } /* end first pass */
886 DEBUG_TRIE_COMPILE_r(
887 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
888 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 889 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
890 );
891
892
893 /*
894 We now know what we are dealing with in terms of unique chars and
895 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
896 representation using a flat table will take. If it's over a reasonable
897 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
898 conservative but potentially much slower representation using an array
899 of lists.
900
901 At the end we convert both representations into the same compressed
902 form that will be used in regexec.c for matching with. The latter
903 is a form that cannot be used to construct with but has memory
904 properties similar to the list form and access properties similar
905 to the table form making it both suitable for fast searches and
906 small enough that its feasable to store for the duration of a program.
907
908 See the comment in the code where the compressed table is produced
909 inplace from the flat tabe representation for an explanation of how
910 the compression works.
911
912 */
913
914
915 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
916 /*
917 Second Pass -- Array Of Lists Representation
918
919 Each state will be represented by a list of charid:state records
920 (reg_trie_trans_le) the first such element holds the CUR and LEN
921 points of the allocated array. (See defines above).
922
923 We build the initial structure using the lists, and then convert
924 it into the compressed table form which allows faster lookups
925 (but cant be modified once converted).
926
927
928 */
929
930
931 STRLEN transcount = 1;
932
a02a5408 933 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
934 TRIE_LIST_NEW(1);
935 next_alloc = 2;
936
937 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
938
c445ea15
AL
939 regnode * const noper = NEXTOPER( cur );
940 U8 *uc = (U8*)STRING( noper );
941 const U8 * const e = uc + STR_LEN( noper );
942 U32 state = 1; /* required init */
943 U16 charid = 0; /* sanity init */
944 U8 *scan = (U8*)NULL; /* sanity init */
945 STRLEN foldlen = 0; /* required init */
946 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
947
948 for ( ; uc < e ; uc += len ) {
949
950 TRIE_READ_CHAR;
951
952 if ( uvc < 256 ) {
953 charid = trie->charmap[ uvc ];
954 } else {
955 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
956 if ( !svpp ) {
957 charid = 0;
958 } else {
959 charid=(U16)SvIV( *svpp );
960 }
961 }
962 if ( charid ) {
a3621e74 963
c445ea15
AL
964 U16 check;
965 U32 newstate = 0;
a3621e74 966
c445ea15
AL
967 charid--;
968 if ( !trie->states[ state ].trans.list ) {
969 TRIE_LIST_NEW( state );
970 }
971 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
972 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
973 newstate = TRIE_LIST_ITEM( state, check ).newstate;
974 break;
975 }
976 }
977 if ( ! newstate ) {
978 newstate = next_alloc++;
979 TRIE_LIST_PUSH( state, charid, newstate );
980 transcount++;
981 }
982 state = newstate;
983 } else {
984 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 985 }
c445ea15
AL
986 /* charid is now 0 if we dont know the char read, or nonzero if we do */
987 }
a3621e74 988
c445ea15
AL
989 if ( !trie->states[ state ].wordnum ) {
990 /* we havent inserted this word into the structure yet. */
991 trie->states[ state ].wordnum = ++curword;
a3621e74 992
c445ea15
AL
993 DEBUG_r({
994 /* store the word for dumping */
995 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
996 if ( UTF ) SvUTF8_on( tmp );
997 av_push( trie->words, tmp );
998 });
a3621e74 999
c445ea15 1000 } else {
6f207bd3 1001 NOOP; /* It's a dupe. So ignore it. */
c445ea15 1002 }
a3621e74
YO
1003
1004 } /* end second pass */
1005
1006 trie->laststate = next_alloc;
1007 Renew( trie->states, next_alloc, reg_trie_state );
1008
1009 DEBUG_TRIE_COMPILE_MORE_r({
1010 U32 state;
a3621e74 1011
a28509cc 1012 /* print out the table precompression. */
a3621e74
YO
1013
1014 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1015 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1016
1017 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1018 U16 charid;
a3621e74 1019
e4584336 1020 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1021 if ( ! trie->states[ state ].wordnum ) {
1022 PerlIO_printf( Perl_debug_log, "%5s| ","");
1023 } else {
e4584336 1024 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1025 trie->states[ state ].wordnum
1026 );
1027 }
1028 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1029 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1030 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1031 SvPV_nolen_const( *tmp ),
a3621e74 1032 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1033 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1034 );
1035 }
1036
1037 }
1038 PerlIO_printf( Perl_debug_log, "\n\n" );
1039 });
1040
a02a5408 1041 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1042 {
1043 U32 state;
a3621e74
YO
1044 U32 tp = 0;
1045 U32 zp = 0;
1046
1047
1048 for( state=1 ; state < next_alloc ; state ++ ) {
1049 U32 base=0;
1050
1051 /*
1052 DEBUG_TRIE_COMPILE_MORE_r(
1053 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1054 );
1055 */
1056
1057 if (trie->states[state].trans.list) {
1058 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1059 U16 maxid=minid;
a28509cc 1060 U16 idx;
a3621e74
YO
1061
1062 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1063 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1064 if ( forid < minid ) {
1065 minid=forid;
1066 } else if ( forid > maxid ) {
1067 maxid=forid;
1068 }
a3621e74
YO
1069 }
1070 if ( transcount < tp + maxid - minid + 1) {
1071 transcount *= 2;
1072 Renew( trie->trans, transcount, reg_trie_trans );
1073 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1074 }
1075 base = trie->uniquecharcount + tp - minid;
1076 if ( maxid == minid ) {
1077 U32 set = 0;
1078 for ( ; zp < tp ; zp++ ) {
1079 if ( ! trie->trans[ zp ].next ) {
1080 base = trie->uniquecharcount + zp - minid;
1081 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1082 trie->trans[ zp ].check = state;
1083 set = 1;
1084 break;
1085 }
1086 }
1087 if ( !set ) {
1088 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1089 trie->trans[ tp ].check = state;
1090 tp++;
1091 zp = tp;
1092 }
1093 } else {
1094 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1095 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1096 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1097 trie->trans[ tid ].check = state;
1098 }
1099 tp += ( maxid - minid + 1 );
1100 }
1101 Safefree(trie->states[ state ].trans.list);
1102 }
1103 /*
1104 DEBUG_TRIE_COMPILE_MORE_r(
1105 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1106 );
1107 */
1108 trie->states[ state ].trans.base=base;
1109 }
cc601c31 1110 trie->lasttrans = tp + 1;
a3621e74
YO
1111 }
1112 } else {
1113 /*
1114 Second Pass -- Flat Table Representation.
1115
1116 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1117 We know that we will need Charcount+1 trans at most to store the data
1118 (one row per char at worst case) So we preallocate both structures
1119 assuming worst case.
1120
1121 We then construct the trie using only the .next slots of the entry
1122 structs.
1123
1124 We use the .check field of the first entry of the node temporarily to
1125 make compression both faster and easier by keeping track of how many non
1126 zero fields are in the node.
1127
1128 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1129 transition.
1130
1131 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1132 number representing the first entry of the node, and state as a
1133 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1134 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1135 are 2 entrys per node. eg:
1136
1137 A B A B
1138 1. 2 4 1. 3 7
1139 2. 0 3 3. 0 5
1140 3. 0 0 5. 0 0
1141 4. 0 0 7. 0 0
1142
1143 The table is internally in the right hand, idx form. However as we also
1144 have to deal with the states array which is indexed by nodenum we have to
1145 use TRIE_NODENUM() to convert.
1146
1147 */
1148
a02a5408 1149 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1150 reg_trie_trans );
a02a5408 1151 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1152 next_alloc = trie->uniquecharcount + 1;
1153
1154 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1155
c445ea15 1156 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1157 const U8 *uc = (U8*)STRING( noper );
1158 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1159
1160 U32 state = 1; /* required init */
1161
1162 U16 charid = 0; /* sanity init */
1163 U32 accept_state = 0; /* sanity init */
1164 U8 *scan = (U8*)NULL; /* sanity init */
1165
1166 STRLEN foldlen = 0; /* required init */
1167 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1168
1169
1170 for ( ; uc < e ; uc += len ) {
1171
1172 TRIE_READ_CHAR;
1173
1174 if ( uvc < 256 ) {
1175 charid = trie->charmap[ uvc ];
1176 } else {
c445ea15
AL
1177 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1178 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1179 }
1180 if ( charid ) {
1181 charid--;
1182 if ( !trie->trans[ state + charid ].next ) {
1183 trie->trans[ state + charid ].next = next_alloc;
1184 trie->trans[ state ].check++;
1185 next_alloc += trie->uniquecharcount;
1186 }
1187 state = trie->trans[ state + charid ].next;
1188 } else {
e4584336 1189 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1190 }
1191 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1192 }
1193
1194 accept_state = TRIE_NODENUM( state );
1195 if ( !trie->states[ accept_state ].wordnum ) {
1196 /* we havent inserted this word into the structure yet. */
1197 trie->states[ accept_state ].wordnum = ++curword;
1198
1199 DEBUG_r({
1200 /* store the word for dumping */
1201 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1202 if ( UTF ) SvUTF8_on( tmp );
1203 av_push( trie->words, tmp );
1204 });
1205
1206 } else {
6f207bd3 1207 NOOP; /* Its a dupe. So ignore it. */
a3621e74
YO
1208 }
1209
1210 } /* end second pass */
1211
1212 DEBUG_TRIE_COMPILE_MORE_r({
1213 /*
1214 print out the table precompression so that we can do a visual check
1215 that they are identical.
1216 */
1217 U32 state;
1218 U16 charid;
1219 PerlIO_printf( Perl_debug_log, "\nChar : " );
1220
1221 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1222 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1223 if ( tmp ) {
cfd0369c 1224 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1225 }
1226 }
1227
1228 PerlIO_printf( Perl_debug_log, "\nState+-" );
1229
1230 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1231 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1232 }
1233
1234 PerlIO_printf( Perl_debug_log, "\n" );
1235
1236 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1237
e4584336 1238 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1239
1240 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1241 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1242 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1243 }
1244 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1245 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1246 } else {
e4584336 1247 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1248 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1249 }
1250 }
1251 PerlIO_printf( Perl_debug_log, "\n\n" );
1252 });
1253 {
1254 /*
1255 * Inplace compress the table.*
1256
1257 For sparse data sets the table constructed by the trie algorithm will
1258 be mostly 0/FAIL transitions or to put it another way mostly empty.
1259 (Note that leaf nodes will not contain any transitions.)
1260
1261 This algorithm compresses the tables by eliminating most such
1262 transitions, at the cost of a modest bit of extra work during lookup:
1263
1264 - Each states[] entry contains a .base field which indicates the
1265 index in the state[] array wheres its transition data is stored.
1266
1267 - If .base is 0 there are no valid transitions from that node.
1268
1269 - If .base is nonzero then charid is added to it to find an entry in
1270 the trans array.
1271
1272 -If trans[states[state].base+charid].check!=state then the
1273 transition is taken to be a 0/Fail transition. Thus if there are fail
1274 transitions at the front of the node then the .base offset will point
1275 somewhere inside the previous nodes data (or maybe even into a node
1276 even earlier), but the .check field determines if the transition is
1277 valid.
1278
1279 The following process inplace converts the table to the compressed
1280 table: We first do not compress the root node 1,and mark its all its
1281 .check pointers as 1 and set its .base pointer as 1 as well. This
1282 allows to do a DFA construction from the compressed table later, and
1283 ensures that any .base pointers we calculate later are greater than
1284 0.
1285
1286 - We set 'pos' to indicate the first entry of the second node.
1287
1288 - We then iterate over the columns of the node, finding the first and
1289 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1290 and set the .check pointers accordingly, and advance pos
1291 appropriately and repreat for the next node. Note that when we copy
1292 the next pointers we have to convert them from the original
1293 NODEIDX form to NODENUM form as the former is not valid post
1294 compression.
1295
1296 - If a node has no transitions used we mark its base as 0 and do not
1297 advance the pos pointer.
1298
1299 - If a node only has one transition we use a second pointer into the
1300 structure to fill in allocated fail transitions from other states.
1301 This pointer is independent of the main pointer and scans forward
1302 looking for null transitions that are allocated to a state. When it
1303 finds one it writes the single transition into the "hole". If the
1304 pointer doesnt find one the single transition is appeneded as normal.
1305
1306 - Once compressed we can Renew/realloc the structures to release the
1307 excess space.
1308
1309 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1310 specifically Fig 3.47 and the associated pseudocode.
1311
1312 demq
1313 */
a3b680e6 1314 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1315 U32 state, charid;
a3621e74
YO
1316 U32 pos = 0, zp=0;
1317 trie->laststate = laststate;
1318
1319 for ( state = 1 ; state < laststate ; state++ ) {
1320 U8 flag = 0;
a28509cc
AL
1321 const U32 stateidx = TRIE_NODEIDX( state );
1322 const U32 o_used = trie->trans[ stateidx ].check;
1323 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1324 trie->trans[ stateidx ].check = 0;
1325
1326 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1327 if ( flag || trie->trans[ stateidx + charid ].next ) {
1328 if ( trie->trans[ stateidx + charid ].next ) {
1329 if (o_used == 1) {
1330 for ( ; zp < pos ; zp++ ) {
1331 if ( ! trie->trans[ zp ].next ) {
1332 break;
1333 }
1334 }
1335 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1336 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1337 trie->trans[ zp ].check = state;
1338 if ( ++zp > pos ) pos = zp;
1339 break;
1340 }
1341 used--;
1342 }
1343 if ( !flag ) {
1344 flag = 1;
1345 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1346 }
1347 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1348 trie->trans[ pos ].check = state;
1349 pos++;
1350 }
1351 }
1352 }
cc601c31 1353 trie->lasttrans = pos + 1;
a3621e74
YO
1354 Renew( trie->states, laststate + 1, reg_trie_state);
1355 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1356 PerlIO_printf( Perl_debug_log,
1357 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1358 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1359 (IV)next_alloc,
1360 (IV)pos,
a3621e74
YO
1361 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1362 );
1363
1364 } /* end table compress */
1365 }
cc601c31
YO
1366 /* resize the trans array to remove unused space */
1367 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1368
1369 DEBUG_TRIE_COMPILE_r({
1370 U32 state;
1371 /*
1372 Now we print it out again, in a slightly different form as there is additional
1373 info we want to be able to see when its compressed. They are close enough for
1374 visual comparison though.
1375 */
1376 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1377
1378 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1379 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1380 if ( tmp ) {
cfd0369c 1381 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1382 }
1383 }
1384 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1385
a3621e74
YO
1386 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1387 PerlIO_printf( Perl_debug_log, "-----");
1388 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1389
a3621e74 1390 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1391 const U32 base = trie->states[ state ].trans.base;
a3621e74 1392
e4584336 1393 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1394
1395 if ( trie->states[ state ].wordnum ) {
1396 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1397 } else {
1398 PerlIO_printf( Perl_debug_log, "%6s", "" );
1399 }
1400
e4584336 1401 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1402
1403 if ( base ) {
1404 U32 ofs = 0;
1405
cc601c31
YO
1406 while( ( base + ofs < trie->uniquecharcount ) ||
1407 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1408 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1409 ofs++;
1410
e4584336 1411 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1412
1413 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1414 if ( ( base + ofs >= trie->uniquecharcount ) &&
1415 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1416 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1417 {
e4584336
RB
1418 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1419 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1420 } else {
1421 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1422 }
1423 }
1424
e4584336 1425 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1426
1427 }
1428 PerlIO_printf( Perl_debug_log, "\n" );
1429 }
1430 });
1431
1432 {
1433 /* now finally we "stitch in" the new TRIE node
1434 This means we convert either the first branch or the first Exact,
1435 depending on whether the thing following (in 'last') is a branch
1436 or not and whther first is the startbranch (ie is it a sub part of
1437 the alternation or is it the whole thing.)
1438 Assuming its a sub part we conver the EXACT otherwise we convert
1439 the whole branch sequence, including the first.
1440 */
1441 regnode *convert;
1442
1443
1444
1445
1446 if ( first == startbranch && OP( last ) != BRANCH ) {
1447 convert = first;
1448 } else {
1449 convert = NEXTOPER( first );
1450 NEXT_OFF( first ) = (U16)(last - first);
1451 }
1452
1453 OP( convert ) = TRIE + (U8)( flags - EXACT );
1454 NEXT_OFF( convert ) = (U16)(tail - convert);
1455 ARG_SET( convert, data_slot );
1456
1457 /* tells us if we need to handle accept buffers specially */
1458 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1459
1460
1461 /* needed for dumping*/
1462 DEBUG_r({
1463 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1464 /* We now need to mark all of the space originally used by the
1465 branches as optimized away. This keeps the dumpuntil from
1466 throwing a wobbly as it doesnt use regnext() to traverse the
1467 opcodes.
1468 */
1469 while( optimize < last ) {
1470 OP( optimize ) = OPTIMIZED;
1471 optimize++;
1472 }
1473 });
1474 } /* end node insert */
1475 return 1;
1476}
1477
1478
1479
1480/*
5d1c421c
JH
1481 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1482 * These need to be revisited when a newer toolchain becomes available.
1483 */
1484#if defined(__sparc64__) && defined(__GNUC__)
1485# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1486# undef SPARC64_GCC_WORKAROUND
1487# define SPARC64_GCC_WORKAROUND 1
1488# endif
1489#endif
1490
653099ff
GS
1491/* REx optimizer. Converts nodes into quickier variants "in place".
1492 Finds fixed substrings. */
1493
a0288114 1494/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1495 to the position after last scanned or to NULL. */
1496
a3621e74 1497
76e3520e 1498STATIC I32
9a957fbc
AL
1499S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1500 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1501 /* scanp: Start here (read-write). */
1502 /* deltap: Write maxlen-minlen here. */
1503 /* last: Stop before this one. */
1504{
97aff369 1505 dVAR;
c277df42
IZ
1506 I32 min = 0, pars = 0, code;
1507 regnode *scan = *scanp, *next;
1508 I32 delta = 0;
1509 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1510 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1511 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1512 scan_data_t data_fake;
653099ff 1513 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1514 SV *re_trie_maxbuff = NULL;
1515
1516 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1517
c277df42
IZ
1518 while (scan && OP(scan) != END && scan < last) {
1519 /* Peephole optimizer: */
a3621e74 1520 DEBUG_OPTIMISE_r({
c445ea15 1521 SV * const mysv=sv_newmortal();
32fc9b6a 1522 regprop(RExC_rx, mysv, scan);
e4584336 1523 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1524 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1525 });
c277df42 1526
22c35a8c 1527 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1528 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1529 regnode *n = regnext(scan);
1530 U32 stringok = 1;
1531#ifdef DEBUGGING
1532 regnode *stop = scan;
b81d288d 1533#endif
c277df42 1534
cd439c50 1535 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1536 /* Skip NOTHING, merge EXACT*. */
1537 while (n &&
b81d288d 1538 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1539 (stringok && (OP(n) == OP(scan))))
1540 && NEXT_OFF(n)
1541 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1542 if (OP(n) == TAIL || n > next)
1543 stringok = 0;
22c35a8c 1544 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1545 NEXT_OFF(scan) += NEXT_OFF(n);
1546 next = n + NODE_STEP_REGNODE;
1547#ifdef DEBUGGING
1548 if (stringok)
1549 stop = n;
b81d288d 1550#endif
c277df42 1551 n = regnext(n);
a0ed51b3 1552 }
f49d4d0f 1553 else if (stringok) {
a3b680e6 1554 const int oldl = STR_LEN(scan);
c445ea15 1555 regnode * const nnext = regnext(n);
f49d4d0f 1556
b81d288d 1557 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1558 break;
1559 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1560 STR_LEN(scan) += STR_LEN(n);
1561 next = n + NODE_SZ_STR(n);
c277df42 1562 /* Now we can overwrite *n : */
f49d4d0f 1563 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1564#ifdef DEBUGGING
f49d4d0f 1565 stop = next - 1;
b81d288d 1566#endif
c277df42
IZ
1567 n = nnext;
1568 }
1569 }
61a36c01 1570
a3621e74 1571 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1572/*
1573 Two problematic code points in Unicode casefolding of EXACT nodes:
1574
1575 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1576 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1577
1578 which casefold to
1579
1580 Unicode UTF-8
1581
1582 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1583 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1584
1585 This means that in case-insensitive matching (or "loose matching",
1586 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1587 length of the above casefolded versions) can match a target string
1588 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1589 This would rather mess up the minimum length computation.
1590
1591 What we'll do is to look for the tail four bytes, and then peek
1592 at the preceding two bytes to see whether we need to decrease
1593 the minimum length by four (six minus two).
1594
1595 Thanks to the design of UTF-8, there cannot be false matches:
1596 A sequence of valid UTF-8 bytes cannot be a subsequence of
1597 another valid sequence of UTF-8 bytes.
1598
1599*/
c445ea15
AL
1600 char * const s0 = STRING(scan), *s, *t;
1601 char * const s1 = s0 + STR_LEN(scan) - 1;
1602 char * const s2 = s1 - 4;
d4c19fe8 1603 const char t0[] = "\xcc\x88\xcc\x81";
a28509cc 1604 const char * const t1 = t0 + 3;
2af232bd 1605
61a36c01
JH
1606 for (s = s0 + 2;
1607 s < s2 && (t = ninstr(s, s1, t0, t1));
1608 s = t + 4) {
1609 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1610 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1611 min -= 4;
1612 }
1613 }
1614
c277df42
IZ
1615#ifdef DEBUGGING
1616 /* Allow dumping */
cd439c50 1617 n = scan + NODE_SZ_STR(scan);
c277df42 1618 while (n <= stop) {
22c35a8c 1619 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1620 OP(n) = OPTIMIZED;
1621 NEXT_OFF(n) = 0;
1622 }
1623 n++;
1624 }
653099ff 1625#endif
c277df42 1626 }
a3621e74
YO
1627
1628
1629
653099ff
GS
1630 /* Follow the next-chain of the current node and optimize
1631 away all the NOTHINGs from it. */
c277df42 1632 if (OP(scan) != CURLYX) {
a3b680e6 1633 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1634 ? I32_MAX
1635 /* I32 may be smaller than U16 on CRAYs! */
1636 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1637 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1638 int noff;
1639 regnode *n = scan;
b81d288d 1640
c277df42
IZ
1641 /* Skip NOTHING and LONGJMP. */
1642 while ((n = regnext(n))
22c35a8c 1643 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1644 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1645 && off + noff < max)
1646 off += noff;
1647 if (reg_off_by_arg[OP(scan)])
1648 ARG(scan) = off;
b81d288d 1649 else
c277df42
IZ
1650 NEXT_OFF(scan) = off;
1651 }
a3621e74 1652
653099ff
GS
1653 /* The principal pseudo-switch. Cannot be a switch, since we
1654 look into several different things. */
b81d288d 1655 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1656 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1657 next = regnext(scan);
1658 code = OP(scan);
a3621e74 1659 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1660
1661 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1662 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1663 struct regnode_charclass_class accum;
d4c19fe8 1664 regnode * const startbranch=scan;
c277df42 1665
653099ff 1666 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1667 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1668 if (flags & SCF_DO_STCLASS)
830247a4 1669 cl_init_zero(pRExC_state, &accum);
a3621e74 1670
c277df42 1671 while (OP(scan) == code) {
830247a4 1672 I32 deltanext, minnext, f = 0, fake;
653099ff 1673 struct regnode_charclass_class this_class;
c277df42
IZ
1674
1675 num++;
1676 data_fake.flags = 0;
b81d288d 1677 if (data) {
2c2d71f5 1678 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1679 data_fake.last_closep = data->last_closep;
1680 }
1681 else
1682 data_fake.last_closep = &fake;
c277df42
IZ
1683 next = regnext(scan);
1684 scan = NEXTOPER(scan);
1685 if (code != BRANCH)
1686 scan = NEXTOPER(scan);
653099ff 1687 if (flags & SCF_DO_STCLASS) {
830247a4 1688 cl_init(pRExC_state, &this_class);
653099ff
GS
1689 data_fake.start_class = &this_class;
1690 f = SCF_DO_STCLASS_AND;
b81d288d 1691 }
e1901655
IZ
1692 if (flags & SCF_WHILEM_VISITED_POS)
1693 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1694
653099ff 1695 /* we suppose the run is continuous, last=next...*/
830247a4 1696 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1697 next, &data_fake, f,depth+1);
b81d288d 1698 if (min1 > minnext)
c277df42
IZ
1699 min1 = minnext;
1700 if (max1 < minnext + deltanext)
1701 max1 = minnext + deltanext;
1702 if (deltanext == I32_MAX)
aca2d497 1703 is_inf = is_inf_internal = 1;
c277df42
IZ
1704 scan = next;
1705 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1706 pars++;
405ff068 1707 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1708 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1709 if (data)
1710 data->whilem_c = data_fake.whilem_c;
653099ff 1711 if (flags & SCF_DO_STCLASS)
830247a4 1712 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1713 if (code == SUSPEND)
c277df42
IZ
1714 break;
1715 }
1716 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1717 min1 = 0;
1718 if (flags & SCF_DO_SUBSTR) {
1719 data->pos_min += min1;
1720 data->pos_delta += max1 - min1;
1721 if (max1 != min1 || is_inf)
1722 data->longest = &(data->longest_float);
1723 }
1724 min += min1;
1725 delta += max1 - min1;
653099ff 1726 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1727 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1728 if (min1) {
1729 cl_and(data->start_class, &and_with);
1730 flags &= ~SCF_DO_STCLASS;
1731 }
1732 }
1733 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1734 if (min1) {
1735 cl_and(data->start_class, &accum);
653099ff 1736 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1737 }
1738 else {
b81d288d 1739 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1740 * data->start_class */
1741 StructCopy(data->start_class, &and_with,
1742 struct regnode_charclass_class);
1743 flags &= ~SCF_DO_STCLASS_AND;
1744 StructCopy(&accum, data->start_class,
1745 struct regnode_charclass_class);
1746 flags |= SCF_DO_STCLASS_OR;
1747 data->start_class->flags |= ANYOF_EOS;
1748 }
653099ff 1749 }
a3621e74
YO
1750
1751 /* demq.
1752
1753 Assuming this was/is a branch we are dealing with: 'scan' now
1754 points at the item that follows the branch sequence, whatever
1755 it is. We now start at the beginning of the sequence and look
1756 for subsequences of
1757
1758 BRANCH->EXACT=>X
1759 BRANCH->EXACT=>X
1760
1761 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1762
1763 If we can find such a subseqence we need to turn the first
1764 element into a trie and then add the subsequent branch exact
1765 strings to the trie.
1766
1767 We have two cases
1768
1769 1. patterns where the whole set of branch can be converted to a trie,
1770
1771 2. patterns where only a subset of the alternations can be
1772 converted to a trie.
1773
1774 In case 1 we can replace the whole set with a single regop
1775 for the trie. In case 2 we need to keep the start and end
1776 branchs so
1777
1778 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1779 becomes BRANCH TRIE; BRANCH X;
1780
1781 Hypthetically when we know the regex isnt anchored we can
1782 turn a case 1 into a DFA and let it rip... Every time it finds a match
1783 it would just call its tail, no WHILEM/CURLY needed.
1784
1785 */
0111c4fd
RGS
1786 if (DO_TRIE) {
1787 if (!re_trie_maxbuff) {
1788 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1789 if (!SvIOK(re_trie_maxbuff))
1790 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1791 }
a3621e74
YO
1792 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1793 regnode *cur;
1794 regnode *first = (regnode *)NULL;
1795 regnode *last = (regnode *)NULL;
1796 regnode *tail = scan;
1797 U8 optype = 0;
1798 U32 count=0;
1799
1800#ifdef DEBUGGING
c445ea15 1801 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
1802#endif
1803 /* var tail is used because there may be a TAIL
1804 regop in the way. Ie, the exacts will point to the
1805 thing following the TAIL, but the last branch will
1806 point at the TAIL. So we advance tail. If we
1807 have nested (?:) we may have to move through several
1808 tails.
1809 */
1810
1811 while ( OP( tail ) == TAIL ) {
1812 /* this is the TAIL generated by (?:) */
1813 tail = regnext( tail );
1814 }
1815
1816 DEBUG_OPTIMISE_r({
32fc9b6a 1817 regprop(RExC_rx, mysv, tail );
a3621e74 1818 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1819 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1820 (RExC_seen_evals) ? "[EVAL]" : ""
1821 );
1822 });
1823 /*
1824
1825 step through the branches, cur represents each
1826 branch, noper is the first thing to be matched
1827 as part of that branch and noper_next is the
1828 regnext() of that node. if noper is an EXACT
1829 and noper_next is the same as scan (our current
1830 position in the regex) then the EXACT branch is
1831 a possible optimization target. Once we have
1832 two or more consequetive such branches we can
1833 create a trie of the EXACT's contents and stich
1834 it in place. If the sequence represents all of
1835 the branches we eliminate the whole thing and
1836 replace it with a single TRIE. If it is a
1837 subsequence then we need to stitch it in. This
1838 means the first branch has to remain, and needs
1839 to be repointed at the item on the branch chain
1840 following the last branch optimized. This could
1841 be either a BRANCH, in which case the
1842 subsequence is internal, or it could be the
1843 item following the branch sequence in which
1844 case the subsequence is at the end.
1845
1846 */
1847
1848 /* dont use tail as the end marker for this traverse */
1849 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1850 regnode * const noper = NEXTOPER( cur );
1851 regnode * const noper_next = regnext( noper );
a3621e74 1852
a3621e74 1853 DEBUG_OPTIMISE_r({
32fc9b6a 1854 regprop(RExC_rx, mysv, cur);
a3621e74 1855 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1856 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74 1857
32fc9b6a 1858 regprop(RExC_rx, mysv, noper);
a3621e74 1859 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1860 SvPV_nolen_const(mysv));
a3621e74
YO
1861
1862 if ( noper_next ) {
32fc9b6a 1863 regprop(RExC_rx, mysv, noper_next );
a3621e74 1864 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1865 SvPV_nolen_const(mysv));
a3621e74
YO
1866 }
1867 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
ca0270c4 1868 (void*)first, (void*)last, (void*)cur );
a3621e74
YO
1869 });
1870 if ( ( first ? OP( noper ) == optype
1871 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1872 && noper_next == tail && count<U16_MAX)
1873 {
1874 count++;
1875 if ( !first ) {
1876 first = cur;
1877 optype = OP( noper );
1878 } else {
1879 DEBUG_OPTIMISE_r(
1880 if (!last ) {
32fc9b6a 1881 regprop(RExC_rx, mysv, first);
a3621e74 1882 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1883 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
32fc9b6a 1884 regprop(RExC_rx, mysv, NEXTOPER(first) );
a3621e74 1885 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1886 SvPV_nolen_const( mysv ) );
a3621e74
YO
1887 }
1888 );
1889 last = cur;
1890 DEBUG_OPTIMISE_r({
32fc9b6a 1891 regprop(RExC_rx, mysv, cur);
a3621e74 1892 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1893 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
32fc9b6a 1894 regprop(RExC_rx, mysv, noper );
a3621e74 1895 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1896 SvPV_nolen_const( mysv ) );
a3621e74
YO
1897 });
1898 }
1899 } else {
1900 if ( last ) {
1901 DEBUG_OPTIMISE_r(
1902 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1903 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1904 );
1905 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1906 }
1907 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1908 && noper_next == tail )
1909 {
1910 count = 1;
1911 first = cur;
1912 optype = OP( noper );
1913 } else {
1914 count = 0;
1915 first = NULL;
1916 optype = 0;
1917 }
1918 last = NULL;
1919 }
1920 }
1921 DEBUG_OPTIMISE_r({
32fc9b6a 1922 regprop(RExC_rx, mysv, cur);
a3621e74 1923 PerlIO_printf( Perl_debug_log,
e4584336 1924 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
ca0270c4 1925 " ", SvPV_nolen_const( mysv ), (void*)first, (void*)last, (void*)cur);
a3621e74
YO
1926
1927 });
1928 if ( last ) {
1929 DEBUG_OPTIMISE_r(
1930 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1931 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1932 );
1933 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1934 }
1935 }
1936 }
a0ed51b3 1937 }
a3621e74 1938 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1939 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1940 } else /* single branch is optimized. */
c277df42
IZ
1941 scan = NEXTOPER(scan);
1942 continue;
a0ed51b3
LW
1943 }
1944 else if (OP(scan) == EXACT) {
cd439c50 1945 I32 l = STR_LEN(scan);
c445ea15 1946 UV uc;
a0ed51b3 1947 if (UTF) {
a3b680e6 1948 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1949 l = utf8_length(s, s + l);
9041c2e3 1950 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
1951 } else {
1952 uc = *((U8*)STRING(scan));
a0ed51b3
LW
1953 }
1954 min += l;
c277df42 1955 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1956 /* The code below prefers earlier match for fixed
1957 offset, later match for variable offset. */
1958 if (data->last_end == -1) { /* Update the start info. */
1959 data->last_start_min = data->pos_min;
1960 data->last_start_max = is_inf
b81d288d 1961 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 1962 }
cd439c50 1963 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
1964 if (UTF)
1965 SvUTF8_on(data->last_found);
0eda9292 1966 {
9a957fbc 1967 SV * const sv = data->last_found;
a28509cc 1968 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
1969 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1970 if (mg && mg->mg_len >= 0)
5e43f467
JH
1971 mg->mg_len += utf8_length((U8*)STRING(scan),
1972 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 1973 }
c277df42
IZ
1974 data->last_end = data->pos_min + l;
1975 data->pos_min += l; /* As in the first entry. */
1976 data->flags &= ~SF_BEFORE_EOL;
1977 }
653099ff
GS
1978 if (flags & SCF_DO_STCLASS_AND) {
1979 /* Check whether it is compatible with what we know already! */
1980 int compat = 1;
1981
1aa99e6b 1982 if (uc >= 0x100 ||
516a5887 1983 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 1984 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 1985 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 1986 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 1987 )
653099ff
GS
1988 compat = 0;
1989 ANYOF_CLASS_ZERO(data->start_class);
1990 ANYOF_BITMAP_ZERO(data->start_class);
1991 if (compat)
1aa99e6b 1992 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 1993 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
1994 if (uc < 0x100)
1995 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
1996 }
1997 else if (flags & SCF_DO_STCLASS_OR) {
1998 /* false positive possible if the class is case-folded */
1aa99e6b 1999 if (uc < 0x100)
9b877dbb
IH
2000 ANYOF_BITMAP_SET(data->start_class, uc);
2001 else
2002 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2003 data->start_class->flags &= ~ANYOF_EOS;
2004 cl_and(data->start_class, &and_with);
2005 }
2006 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2007 }
653099ff 2008 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2009 I32 l = STR_LEN(scan);
1aa99e6b 2010 UV uc = *((U8*)STRING(scan));
653099ff
GS
2011
2012 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2013 if (flags & SCF_DO_SUBSTR) {
2014 assert(data);
830247a4 2015 scan_commit(pRExC_state, data);
ecaa9b9c 2016 }
a0ed51b3 2017 if (UTF) {
6136c704 2018 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2019 l = utf8_length(s, s + l);
9041c2e3 2020 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2021 }
2022 min += l;
ecaa9b9c 2023 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2024 data->pos_min += l;
653099ff
GS
2025 if (flags & SCF_DO_STCLASS_AND) {
2026 /* Check whether it is compatible with what we know already! */
2027 int compat = 1;
2028
1aa99e6b 2029 if (uc >= 0x100 ||
516a5887 2030 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2031 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2032 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2033 compat = 0;
2034 ANYOF_CLASS_ZERO(data->start_class);
2035 ANYOF_BITMAP_ZERO(data->start_class);
2036 if (compat) {
1aa99e6b 2037 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2038 data->start_class->flags &= ~ANYOF_EOS;
2039 data->start_class->flags |= ANYOF_FOLD;
2040 if (OP(scan) == EXACTFL)
2041 data->start_class->flags |= ANYOF_LOCALE;
2042 }
2043 }
2044 else if (flags & SCF_DO_STCLASS_OR) {
2045 if (data->start_class->flags & ANYOF_FOLD) {
2046 /* false positive possible if the class is case-folded.
2047 Assume that the locale settings are the same... */
1aa99e6b
IH
2048 if (uc < 0x100)
2049 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2050 data->start_class->flags &= ~ANYOF_EOS;
2051 }
2052 cl_and(data->start_class, &and_with);
2053 }
2054 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2055 }
bfed75c6 2056 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2057 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2058 I32 f = flags, pos_before = 0;
d4c19fe8 2059 regnode * const oscan = scan;
653099ff
GS
2060 struct regnode_charclass_class this_class;
2061 struct regnode_charclass_class *oclass = NULL;
727f22e3 2062 I32 next_is_eval = 0;
653099ff 2063
22c35a8c 2064 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2065 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2066 scan = NEXTOPER(scan);
2067 goto finish;
2068 case PLUS:
653099ff 2069 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2070 next = NEXTOPER(scan);
653099ff 2071 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2072 mincount = 1;
2073 maxcount = REG_INFTY;
c277df42
IZ
2074 next = regnext(scan);
2075 scan = NEXTOPER(scan);
2076 goto do_curly;
2077 }
2078 }
2079 if (flags & SCF_DO_SUBSTR)
2080 data->pos_min++;
2081 min++;
2082 /* Fall through. */
2083 case STAR:
653099ff
GS
2084 if (flags & SCF_DO_STCLASS) {
2085 mincount = 0;
b81d288d 2086 maxcount = REG_INFTY;
653099ff
GS
2087 next = regnext(scan);
2088 scan = NEXTOPER(scan);
2089 goto do_curly;
2090 }
b81d288d 2091 is_inf = is_inf_internal = 1;
c277df42
IZ
2092 scan = regnext(scan);
2093 if (flags & SCF_DO_SUBSTR) {
830247a4 2094 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2095 data->longest = &(data->longest_float);
2096 }
2097 goto optimize_curly_tail;
2098 case CURLY:
b81d288d 2099 mincount = ARG1(scan);
c277df42
IZ
2100 maxcount = ARG2(scan);
2101 next = regnext(scan);
cb434fcc
IZ
2102 if (OP(scan) == CURLYX) {
2103 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2104 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2105 }
c277df42 2106 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2107 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2108 do_curly:
2109 if (flags & SCF_DO_SUBSTR) {
830247a4 2110 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2111 pos_before = data->pos_min;
2112 }
2113 if (data) {
2114 fl = data->flags;
2115 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2116 if (is_inf)
2117 data->flags |= SF_IS_INF;
2118 }
653099ff 2119 if (flags & SCF_DO_STCLASS) {
830247a4 2120 cl_init(pRExC_state, &this_class);
653099ff
GS
2121 oclass = data->start_class;
2122 data->start_class = &this_class;
2123 f |= SCF_DO_STCLASS_AND;
2124 f &= ~SCF_DO_STCLASS_OR;
2125 }
e1901655
IZ
2126 /* These are the cases when once a subexpression
2127 fails at a particular position, it cannot succeed
2128 even after backtracking at the enclosing scope.
b81d288d 2129
e1901655
IZ
2130 XXXX what if minimal match and we are at the
2131 initial run of {n,m}? */
2132 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2133 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2134
c277df42 2135 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2136 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2137 (mincount == 0
2138 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2139
2140 if (flags & SCF_DO_STCLASS)
2141 data->start_class = oclass;
2142 if (mincount == 0 || minnext == 0) {
2143 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2144 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2145 }
2146 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2147 /* Switch to OR mode: cache the old value of
653099ff
GS
2148 * data->start_class */
2149 StructCopy(data->start_class, &and_with,
2150 struct regnode_charclass_class);
2151 flags &= ~SCF_DO_STCLASS_AND;
2152 StructCopy(&this_class, data->start_class,
2153 struct regnode_charclass_class);
2154 flags |= SCF_DO_STCLASS_OR;
2155 data->start_class->flags |= ANYOF_EOS;
2156 }
2157 } else { /* Non-zero len */
2158 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2159 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2160 cl_and(data->start_class, &and_with);
2161 }
2162 else if (flags & SCF_DO_STCLASS_AND)
2163 cl_and(data->start_class, &this_class);
2164 flags &= ~SCF_DO_STCLASS;
2165 }
c277df42
IZ
2166 if (!scan) /* It was not CURLYX, but CURLY. */
2167 scan = next;
041457d9
DM
2168 if ( /* ? quantifier ok, except for (?{ ... }) */
2169 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2170 && (minnext == 0) && (deltanext == 0)
99799961 2171 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2172 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2173 && ckWARN(WARN_REGEXP))
b45f050a 2174 {
830247a4 2175 vWARN(RExC_parse,
b45f050a
JF
2176 "Quantifier unexpected on zero-length expression");
2177 }
2178
c277df42 2179 min += minnext * mincount;
b81d288d 2180 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2181 && (minnext + deltanext) > 0)
2182 || deltanext == I32_MAX);
aca2d497 2183 is_inf |= is_inf_internal;
c277df42
IZ
2184 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2185
2186 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2187 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2188 && data->flags & SF_IN_PAR
2189 && !(data->flags & SF_HAS_EVAL)
2190 && !deltanext && minnext == 1 ) {
2191 /* Try to optimize to CURLYN. */
2192 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2193 regnode * const nxt1 = nxt;
497b47a8
JH
2194#ifdef DEBUGGING
2195 regnode *nxt2;
2196#endif
c277df42
IZ
2197
2198 /* Skip open. */
2199 nxt = regnext(nxt);
bfed75c6 2200 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2201 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2202 && STR_LEN(nxt) == 1))
c277df42 2203 goto nogo;
497b47a8 2204#ifdef DEBUGGING
c277df42 2205 nxt2 = nxt;
497b47a8 2206#endif
c277df42 2207 nxt = regnext(nxt);
b81d288d 2208 if (OP(nxt) != CLOSE)
c277df42
IZ
2209 goto nogo;
2210 /* Now we know that nxt2 is the only contents: */
eb160463 2211 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2212 OP(oscan) = CURLYN;
2213 OP(nxt1) = NOTHING; /* was OPEN. */
2214#ifdef DEBUGGING
2215 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2216 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2217 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2218 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2219 OP(nxt + 1) = OPTIMIZED; /* was count. */
2220 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2221#endif
c277df42 2222 }
c277df42
IZ
2223 nogo:
2224
2225 /* Try optimization CURLYX => CURLYM. */
b81d288d 2226 if ( OP(oscan) == CURLYX && data
c277df42 2227 && !(data->flags & SF_HAS_PAR)
c277df42 2228 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2229 && !deltanext /* atom is fixed width */
2230 && minnext != 0 /* CURLYM can't handle zero width */
2231 ) {
c277df42
IZ
2232 /* XXXX How to optimize if data == 0? */
2233 /* Optimize to a simpler form. */
2234 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2235 regnode *nxt2;
2236
2237 OP(oscan) = CURLYM;
2238 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2239 && (OP(nxt2) != WHILEM))
c277df42
IZ
2240 nxt = nxt2;
2241 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2242 /* Need to optimize away parenths. */
2243 if (data->flags & SF_IN_PAR) {
2244 /* Set the parenth number. */
2245 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2246
b81d288d 2247 if (OP(nxt) != CLOSE)
b45f050a 2248 FAIL("Panic opt close");
eb160463 2249 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2250 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2251 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2252#ifdef DEBUGGING
2253 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2254 OP(nxt + 1) = OPTIMIZED; /* was count. */
2255 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2256 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2257#endif
c277df42
IZ
2258#if 0
2259 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2260 regnode *nnxt = regnext(nxt1);
b81d288d 2261
c277df42
IZ
2262 if (nnxt == nxt) {
2263 if (reg_off_by_arg[OP(nxt1)])
2264 ARG_SET(nxt1, nxt2 - nxt1);
2265 else if (nxt2 - nxt1 < U16_MAX)
2266 NEXT_OFF(nxt1) = nxt2 - nxt1;
2267 else
2268 OP(nxt) = NOTHING; /* Cannot beautify */
2269 }
2270 nxt1 = nnxt;
2271 }
2272#endif
2273 /* Optimize again: */
b81d288d 2274 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2275 NULL, 0,depth+1);
a0ed51b3
LW
2276 }
2277 else
c277df42 2278 oscan->flags = 0;
c277df42 2279 }
e1901655
IZ
2280 else if ((OP(oscan) == CURLYX)
2281 && (flags & SCF_WHILEM_VISITED_POS)
2282 /* See the comment on a similar expression above.
2283 However, this time it not a subexpression
2284 we care about, but the expression itself. */
2285 && (maxcount == REG_INFTY)
2286 && data && ++data->whilem_c < 16) {
2287 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2288 /* Find WHILEM (as in regexec.c) */
2289 regnode *nxt = oscan + NEXT_OFF(oscan);
2290
2291 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2292 nxt += ARG(nxt);
eb160463
GS
2293 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2294 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2295 }
b81d288d 2296 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2297 pars++;
2298 if (flags & SCF_DO_SUBSTR) {
c445ea15 2299 SV *last_str = NULL;
c277df42
IZ
2300 int counted = mincount != 0;
2301
2302 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2303#if defined(SPARC64_GCC_WORKAROUND)
2304 I32 b = 0;
2305 STRLEN l = 0;
cfd0369c 2306 const char *s = NULL;
5d1c421c
JH
2307 I32 old = 0;
2308
2309 if (pos_before >= data->last_start_min)
2310 b = pos_before;
2311 else
2312 b = data->last_start_min;
2313
2314 l = 0;
cfd0369c 2315 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2316 old = b - data->last_start_min;
2317
2318#else
b81d288d 2319 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2320 ? pos_before : data->last_start_min;
2321 STRLEN l;
d4c19fe8 2322 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2323 I32 old = b - data->last_start_min;
5d1c421c 2324#endif
a0ed51b3
LW
2325
2326 if (UTF)
2327 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2328
a0ed51b3 2329 l -= old;
c277df42 2330 /* Get the added string: */
79cb57f6 2331 last_str = newSVpvn(s + old, l);
0e933229
IH
2332 if (UTF)
2333 SvUTF8_on(last_str);
c277df42
IZ
2334 if (deltanext == 0 && pos_before == b) {
2335 /* What was added is a constant string */
2336 if (mincount > 1) {
2337 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2338 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2339 SvPVX_const(last_str), l, mincount - 1);
b162af07 2340 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2341 /* Add additional parts. */
b81d288d 2342 SvCUR_set(data->last_found,
c277df42
IZ
2343 SvCUR(data->last_found) - l);
2344 sv_catsv(data->last_found, last_str);
0eda9292
JH
2345 {
2346 SV * sv = data->last_found;
2347 MAGIC *mg =
2348 SvUTF8(sv) && SvMAGICAL(sv) ?
2349 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2350 if (mg && mg->mg_len >= 0)
2351 mg->mg_len += CHR_SVLEN(last_str);
2352 }
c277df42
IZ
2353 data->last_end += l * (mincount - 1);
2354 }
2a8d9689
HS
2355 } else {
2356 /* start offset must point into the last copy */
2357 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2358 data->last_start_max += is_inf ? I32_MAX
2359 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2360 }
2361 }
2362 /* It is counted once already... */
2363 data->pos_min += minnext * (mincount - counted);
2364 data->pos_delta += - counted * deltanext +
2365 (minnext + deltanext) * maxcount - minnext * mincount;
2366 if (mincount != maxcount) {
653099ff
GS
2367 /* Cannot extend fixed substrings found inside
2368 the group. */
830247a4 2369 scan_commit(pRExC_state,data);
c277df42 2370 if (mincount && last_str) {
d4c19fe8
AL
2371 SV * const sv = data->last_found;
2372 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2373 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2374
2375 if (mg)
2376 mg->mg_len = -1;
2377 sv_setsv(sv, last_str);
c277df42 2378 data->last_end = data->pos_min;
b81d288d 2379 data->last_start_min =
a0ed51b3 2380 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2381 data->last_start_max = is_inf
2382 ? I32_MAX
c277df42 2383 : data->pos_min + data->pos_delta
a0ed51b3 2384 - CHR_SVLEN(last_str);
c277df42
IZ
2385 }
2386 data->longest = &(data->longest_float);
2387 }
aca2d497 2388 SvREFCNT_dec(last_str);
c277df42 2389 }
405ff068 2390 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2391 data->flags |= SF_HAS_EVAL;
2392 optimize_curly_tail:
c277df42 2393 if (OP(oscan) != CURLYX) {
22c35a8c 2394 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2395 && NEXT_OFF(next))
2396 NEXT_OFF(oscan) += NEXT_OFF(next);
2397 }
c277df42 2398 continue;
653099ff 2399 default: /* REF and CLUMP only? */
c277df42 2400 if (flags & SCF_DO_SUBSTR) {
830247a4 2401 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2402 data->longest = &(data->longest_float);
2403 }
aca2d497 2404 is_inf = is_inf_internal = 1;
653099ff 2405 if (flags & SCF_DO_STCLASS_OR)
830247a4 2406 cl_anything(pRExC_state, data->start_class);
653099ff 2407 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2408 break;
2409 }
a0ed51b3 2410 }
bfed75c6 2411 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2412 int value = 0;
653099ff 2413
c277df42 2414 if (flags & SCF_DO_SUBSTR) {
830247a4 2415 scan_commit(pRExC_state,data);
c277df42
IZ
2416 data->pos_min++;
2417 }
2418 min++;
653099ff
GS
2419 if (flags & SCF_DO_STCLASS) {
2420 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2421
2422 /* Some of the logic below assumes that switching
2423 locale on will only add false positives. */
2424 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2425 case SANY:
653099ff
GS
2426 default:
2427 do_default:
2428 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2429 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2430 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2431 break;
2432 case REG_ANY:
2433 if (OP(scan) == SANY)
2434 goto do_default;
2435 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2436 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2437 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2438 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2439 }
2440 if (flags & SCF_DO_STCLASS_AND || !value)
2441 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2442 break;
2443 case ANYOF:
2444 if (flags & SCF_DO_STCLASS_AND)
2445 cl_and(data->start_class,
2446 (struct regnode_charclass_class*)scan);
2447 else
830247a4 2448 cl_or(pRExC_state, data->start_class,
653099ff
GS
2449 (struct regnode_charclass_class*)scan);
2450 break;
2451 case ALNUM:
2452 if (flags & SCF_DO_STCLASS_AND) {
2453 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2454 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2455 for (value = 0; value < 256; value++)
2456 if (!isALNUM(value))
2457 ANYOF_BITMAP_CLEAR(data->start_class, value);
2458 }
2459 }
2460 else {
2461 if (data->start_class->flags & ANYOF_LOCALE)
2462 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2463 else {
2464 for (value = 0; value < 256; value++)
2465 if (isALNUM(value))
b81d288d 2466 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2467 }
2468 }
2469 break;
2470 case ALNUML:
2471 if (flags & SCF_DO_STCLASS_AND) {
2472 if (data->start_class->flags & ANYOF_LOCALE)
2473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2474 }
2475 else {
2476 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2477 data->start_class->flags |= ANYOF_LOCALE;
2478 }
2479 break;
2480 case NALNUM:
2481 if (flags & SCF_DO_STCLASS_AND) {
2482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2484 for (value = 0; value < 256; value++)
2485 if (isALNUM(value))
2486 ANYOF_BITMAP_CLEAR(data->start_class, value);
2487 }
2488 }
2489 else {
2490 if (data->start_class->flags & ANYOF_LOCALE)
2491 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2492 else {
2493 for (value = 0; value < 256; value++)
2494 if (!isALNUM(value))
b81d288d 2495 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2496 }
2497 }
2498 break;
2499 case NALNUML:
2500 if (flags & SCF_DO_STCLASS_AND) {
2501 if (data->start_class->flags & ANYOF_LOCALE)
2502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2503 }
2504 else {
2505 data->start_class->flags |= ANYOF_LOCALE;
2506 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2507 }
2508 break;
2509 case SPACE:
2510 if (flags & SCF_DO_STCLASS_AND) {
2511 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2512 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2513 for (value = 0; value < 256; value++)
2514 if (!isSPACE(value))
2515 ANYOF_BITMAP_CLEAR(data->start_class, value);
2516 }
2517 }
2518 else {
2519 if (data->start_class->flags & ANYOF_LOCALE)
2520 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2521 else {
2522 for (value = 0; value < 256; value++)
2523 if (isSPACE(value))
b81d288d 2524 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2525 }
2526 }
2527 break;
2528 case SPACEL:
2529 if (flags & SCF_DO_STCLASS_AND) {
2530 if (data->start_class->flags & ANYOF_LOCALE)
2531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2532 }
2533 else {
2534 data->start_class->flags |= ANYOF_LOCALE;
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2536 }
2537 break;
2538 case NSPACE:
2539 if (flags & SCF_DO_STCLASS_AND) {
2540 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2542 for (value = 0; value < 256; value++)
2543 if (isSPACE(value))
2544 ANYOF_BITMAP_CLEAR(data->start_class, value);
2545 }
2546 }
2547 else {
2548 if (data->start_class->flags & ANYOF_LOCALE)
2549 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2550 else {
2551 for (value = 0; value < 256; value++)
2552 if (!isSPACE(value))
b81d288d 2553 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2554 }
2555 }
2556 break;
2557 case NSPACEL:
2558 if (flags & SCF_DO_STCLASS_AND) {
2559 if (data->start_class->flags & ANYOF_LOCALE) {
2560 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2561 for (value = 0; value < 256; value++)
2562 if (!isSPACE(value))
2563 ANYOF_BITMAP_CLEAR(data->start_class, value);
2564 }
2565 }
2566 else {
2567 data->start_class->flags |= ANYOF_LOCALE;
2568 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2569 }
2570 break;
2571 case DIGIT:
2572 if (flags & SCF_DO_STCLASS_AND) {
2573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2574 for (value = 0; value < 256; value++)
2575 if (!isDIGIT(value))
2576 ANYOF_BITMAP_CLEAR(data->start_class, value);
2577 }
2578 else {
2579 if (data->start_class->flags & ANYOF_LOCALE)
2580 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2581 else {
2582 for (value = 0; value < 256; value++)
2583 if (isDIGIT(value))
b81d288d 2584 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2585 }
2586 }
2587 break;
2588 case NDIGIT:
2589 if (flags & SCF_DO_STCLASS_AND) {
2590 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2591 for (value = 0; value < 256; value++)
2592 if (isDIGIT(value))
2593 ANYOF_BITMAP_CLEAR(data->start_class, value);
2594 }
2595 else {
2596 if (data->start_class->flags & ANYOF_LOCALE)
2597 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2598 else {
2599 for (value = 0; value < 256; value++)
2600 if (!isDIGIT(value))
b81d288d 2601 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2602 }
2603 }
2604 break;
2605 }
2606 if (flags & SCF_DO_STCLASS_OR)
2607 cl_and(data->start_class, &and_with);
2608 flags &= ~SCF_DO_STCLASS;
2609 }
a0ed51b3 2610 }
22c35a8c 2611 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2612 data->flags |= (OP(scan) == MEOL
2613 ? SF_BEFORE_MEOL
2614 : SF_BEFORE_SEOL);
a0ed51b3 2615 }
653099ff
GS
2616 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2617 /* Lookbehind, or need to calculate parens/evals/stclass: */
2618 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2619 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2620 /* Lookahead/lookbehind */
cb434fcc 2621 I32 deltanext, minnext, fake = 0;
c277df42 2622 regnode *nscan;
653099ff
GS
2623 struct regnode_charclass_class intrnl;
2624 int f = 0;
c277df42
IZ
2625
2626 data_fake.flags = 0;
b81d288d 2627 if (data) {
2c2d71f5 2628 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2629 data_fake.last_closep = data->last_closep;
2630 }
2631 else
2632 data_fake.last_closep = &fake;
653099ff
GS
2633 if ( flags & SCF_DO_STCLASS && !scan->flags
2634 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2635 cl_init(pRExC_state, &intrnl);
653099ff 2636 data_fake.start_class = &intrnl;
e1901655 2637 f |= SCF_DO_STCLASS_AND;
653099ff 2638 }
e1901655
IZ
2639 if (flags & SCF_WHILEM_VISITED_POS)
2640 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2641 next = regnext(scan);
2642 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2643 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2644 if (scan->flags) {
2645 if (deltanext) {
9baa0206 2646 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2647 }
2648 else if (minnext > U8_MAX) {
9baa0206 2649 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2650 }
eb160463 2651 scan->flags = (U8)minnext;
c277df42
IZ
2652 }
2653 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2654 pars++;
405ff068 2655 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2656 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2657 if (data)
2658 data->whilem_c = data_fake.whilem_c;
e1901655 2659 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2660 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2661
2662 cl_and(data->start_class, &intrnl);
2663 if (was)
2664 data->start_class->flags |= ANYOF_EOS;
2665 }
a0ed51b3
LW
2666 }
2667 else if (OP(scan) == OPEN) {
c277df42 2668 pars++;
a0ed51b3 2669 }
cb434fcc 2670 else if (OP(scan) == CLOSE) {
eb160463 2671 if ((I32)ARG(scan) == is_par) {
cb434fcc 2672 next = regnext(scan);
c277df42 2673
cb434fcc
IZ
2674 if ( next && (OP(next) != WHILEM) && next < last)
2675 is_par = 0; /* Disable optimization */
2676 }
2677 if (data)
2678 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2679 }
2680 else if (OP(scan) == EVAL) {
c277df42
IZ
2681 if (data)
2682 data->flags |= SF_HAS_EVAL;
2683 }
96776eda 2684 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2685 if (flags & SCF_DO_SUBSTR) {
830247a4 2686 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2687 data->longest = &(data->longest_float);
2688 }
2689 is_inf = is_inf_internal = 1;
653099ff 2690 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2691 cl_anything(pRExC_state, data->start_class);
96776eda 2692 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2693 }
c277df42
IZ
2694 /* Else: zero-length, ignore. */
2695 scan = regnext(scan);
2696 }
2697
2698 finish:
2699 *scanp = scan;
aca2d497 2700 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2701 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2702 data->pos_delta = I32_MAX - data->pos_min;
2703 if (is_par > U8_MAX)
2704 is_par = 0;
2705 if (is_par && pars==1 && data) {
2706 data->flags |= SF_IN_PAR;
2707 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2708 }
2709 else if (pars && data) {
c277df42
IZ
2710 data->flags |= SF_HAS_PAR;
2711 data->flags &= ~SF_IN_PAR;
2712 }
653099ff
GS
2713 if (flags & SCF_DO_STCLASS_OR)
2714 cl_and(data->start_class, &and_with);
c277df42
IZ
2715 return min;
2716}
2717
76e3520e 2718STATIC I32
5f66b61c 2719S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2720{
830247a4 2721 if (RExC_rx->data) {
b81d288d
AB
2722 Renewc(RExC_rx->data,
2723 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2724 char, struct reg_data);
830247a4
IZ
2725 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2726 RExC_rx->data->count += n;
a0ed51b3
LW
2727 }
2728 else {
a02a5408 2729 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2730 char, struct reg_data);
a02a5408 2731 Newx(RExC_rx->data->what, n, U8);
830247a4 2732 RExC_rx->data->count = n;
c277df42 2733 }
830247a4
IZ
2734 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2735 return RExC_rx->data->count - n;
c277df42
IZ
2736}
2737
76234dfb 2738#ifndef PERL_IN_XSUB_RE
d88dccdf 2739void
864dbfa3 2740Perl_reginitcolors(pTHX)
d88dccdf 2741{
97aff369 2742 dVAR;
1df70142 2743 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2744 if (s) {
1df70142
AL
2745 char *t = savepv(s);
2746 int i = 0;
2747 PL_colors[0] = t;
d88dccdf 2748 while (++i < 6) {
1df70142
AL
2749 t = strchr(t, '\t');
2750 if (t) {
2751 *t = '\0';
2752 PL_colors[i] = ++t;
d88dccdf
IZ
2753 }
2754 else
1df70142 2755 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2756 }
2757 } else {
1df70142 2758 int i = 0;
b81d288d 2759 while (i < 6)
06b5626a 2760 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2761 }
2762 PL_colorset = 1;
2763}
76234dfb 2764#endif
8615cb43 2765
a687059c 2766/*
e50aee73 2767 - pregcomp - compile a regular expression into internal code
a687059c
LW
2768 *
2769 * We can't allocate space until we know how big the compiled form will be,
2770 * but we can't compile it (and thus know how big it is) until we've got a
2771 * place to put the code. So we cheat: we compile it twice, once with code
2772 * generation turned off and size counting turned on, and once "for real".
2773 * This also means that we don't allocate space until we are sure that the
2774 * thing really will compile successfully, and we never have to move the
2775 * code and thus invalidate pointers into it. (Note that it has to be in
2776 * one piece because free() must be able to free it all.) [NB: not true in perl]
2777 *
2778 * Beware that the optimization-preparation code in here knows about some
2779 * of the structure of the compiled regexp. [I'll say.]
2780 */
2781regexp *
864dbfa3 2782Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2783{
97aff369 2784 dVAR;
a0d0e21e 2785 register regexp *r;
c277df42 2786 regnode *scan;
c277df42 2787 regnode *first;
a0d0e21e 2788 I32 flags;
a0d0e21e
LW
2789 I32 minlen = 0;
2790 I32 sawplus = 0;
2791 I32 sawopen = 0;
2c2d71f5 2792 scan_data_t data;
830247a4
IZ
2793 RExC_state_t RExC_state;
2794 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2795
a3621e74
YO
2796 GET_RE_DEBUG_FLAGS_DECL;
2797
a0d0e21e 2798 if (exp == NULL)
c277df42 2799 FAIL("NULL regexp argument");
a0d0e21e 2800
a5961de5 2801 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2802
5cfc7842 2803 RExC_precomp = exp;
a3621e74
YO
2804 DEBUG_r(if (!PL_colorset) reginitcolors());
2805 DEBUG_COMPILE_r({
2806 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2807 PL_colors[4],PL_colors[5],PL_colors[0],
2808 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2809 });
e2509266 2810 RExC_flags = pm->op_pmflags;
830247a4 2811 RExC_sawback = 0;
bbce6d69 2812
830247a4
IZ
2813 RExC_seen = 0;
2814 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2815 RExC_seen_evals = 0;
2816 RExC_extralen = 0;
c277df42 2817
bbce6d69 2818 /* First pass: determine size, legality. */
830247a4 2819 RExC_parse = exp;
fac92740 2820 RExC_start = exp;
830247a4
IZ
2821 RExC_end = xend;
2822 RExC_naughty = 0;
2823 RExC_npar = 1;
2824 RExC_size = 0L;
2825 RExC_emit = &PL_regdummy;
2826 RExC_whilem_seen = 0;
85ddcde9
JH
2827#if 0 /* REGC() is (currently) a NOP at the first pass.
2828 * Clever compilers notice this and complain. --jhi */
830247a4 2829 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2830#endif
830247a4 2831 if (reg(pRExC_state, 0, &flags) == NULL) {
c445ea15 2832 RExC_precomp = NULL;
a0d0e21e
LW
2833 return(NULL);
2834 }
a3621e74 2835 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2836
c277df42
IZ
2837 /* Small enough for pointer-storage convention?
2838 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2839 if (RExC_size >= 0x10000L && RExC_extralen)
2840 RExC_size += RExC_extralen;
c277df42 2841 else
830247a4
IZ
2842 RExC_extralen = 0;
2843 if (RExC_whilem_seen > 15)
2844 RExC_whilem_seen = 15;
a0d0e21e 2845
bbce6d69 2846 /* Allocate space and initialize. */
a02a5408 2847 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2848 char, regexp);
a0d0e21e 2849 if (r == NULL)
b45f050a
JF
2850 FAIL("Regexp out of space");
2851
0f79a09d
GS
2852#ifdef DEBUGGING
2853 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2854 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2855#endif
c277df42 2856 r->refcnt = 1;
bbce6d69 2857 r->prelen = xend - exp;
5cfc7842 2858 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2859 r->subbeg = NULL;
f8c7b90f 2860#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 2861 r->saved_copy = NULL;
ed252734 2862#endif
cf93c79d 2863 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2864 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 2865 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
2866
2867 r->substrs = 0; /* Useful during FAIL. */
2868 r->startp = 0; /* Useful during FAIL. */
2869 r->endp = 0; /* Useful during FAIL. */
2870
a02a5408 2871 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2872 if (r->offsets) {
2af232bd 2873 r->offsets[0] = RExC_size;
fac92740 2874 }
a3621e74 2875 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2876 "%s %"UVuf" bytes for offset annotations.\n",
2877 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2878 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2879
830247a4 2880 RExC_rx = r;
bbce6d69 2881
2882 /* Second pass: emit code. */
e2509266 2883 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2884 RExC_parse = exp;
2885 RExC_end = xend;
2886 RExC_naughty = 0;
2887 RExC_npar = 1;
fac92740 2888 RExC_emit_start = r->program;
830247a4 2889 RExC_emit = r->program;
2cd61cdb 2890 /* Store the count of eval-groups for security checks: */
eb160463 2891 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2892 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2893 r->data = 0;
830247a4 2894 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2895 return(NULL);
2896
a3621e74 2897
a0d0e21e 2898 /* Dig out information for optimizations. */
cf93c79d 2899 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2900 pm->op_pmflags = RExC_flags;
a0ed51b3 2901 if (UTF)
5ff6fc6d 2902 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2903 r->regstclass = NULL;
830247a4 2904 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2905 r->reganch |= ROPT_NAUGHTY;
c277df42 2906 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2907
2908 /* XXXX To minimize changes to RE engine we always allocate
2909 3-units-long substrs field. */
a02a5408 2910 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2911
2c2d71f5 2912 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2913 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2914 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2915 I32 fake;
c5254dd6 2916 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2917 struct regnode_charclass_class ch_class;
2918 int stclass_flag;
cb434fcc 2919 I32 last_close = 0;
a0d0e21e
LW
2920
2921 first = scan;
c277df42 2922 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2923 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2924 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2925 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2926 (OP(first) == PLUS) ||
2927 (OP(first) == MINMOD) ||
653099ff 2928 /* An {n,m} with n>0 */
22c35a8c 2929 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2930 if (OP(first) == PLUS)
2931 sawplus = 1;
2932 else
2933 first += regarglen[(U8)OP(first)];
2934 first = NEXTOPER(first);
a687059c
LW
2935 }
2936
a0d0e21e
LW
2937 /* Starting-point info. */
2938 again:
653099ff 2939 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b 2940 if (OP(first) == EXACT)
6f207bd3 2941 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 2942 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2943 r->regstclass = first;
2944 }
bfed75c6 2945 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2946 r->regstclass = first;
22c35a8c
GS
2947 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2948 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2949 r->regstclass = first;
22c35a8c 2950 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2951 r->reganch |= (OP(first) == MBOL
2952 ? ROPT_ANCH_MBOL
2953 : (OP(first) == SBOL
2954 ? ROPT_ANCH_SBOL
2955 : ROPT_ANCH_BOL));
a0d0e21e 2956 first = NEXTOPER(first);
774d564b 2957 goto again;
2958 }
2959 else if (OP(first) == GPOS) {
2960 r->reganch |= ROPT_ANCH_GPOS;
2961 first = NEXTOPER(first);
2962 goto again;
a0d0e21e 2963 }
e09294f4 2964 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2965 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2966 !(r->reganch & ROPT_ANCH) )
2967 {
2968 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2969 const int type =
2970 (OP(NEXTOPER(first)) == REG_ANY)
2971 ? ROPT_ANCH_MBOL
2972 : ROPT_ANCH_SBOL;
cad2e5aa 2973 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 2974 first = NEXTOPER(first);
774d564b 2975 goto again;
a0d0e21e 2976 }
b81d288d 2977 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 2978 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
2979 /* x+ must match at the 1st pos of run of x's */
2980 r->reganch |= ROPT_SKIP;
a0d0e21e 2981
c277df42 2982 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 2983 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 2984 (IV)(first - scan + 1)));
a0d0e21e
LW
2985 /*
2986 * If there's something expensive in the r.e., find the
2987 * longest literal string that must appear and make it the
2988 * regmust. Resolve ties in favor of later strings, since
2989 * the regstart check works with the beginning of the r.e.
2990 * and avoiding duplication strengthens checking. Not a
2991 * strong reason, but sufficient in the absence of others.
2992 * [Now we resolve ties in favor of the earlier string if
c277df42 2993 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
2994 * earlier string may buy us something the later one won't.]
2995 */
a0d0e21e 2996 minlen = 0;
a687059c 2997
396482e1
GA
2998 data.longest_fixed = newSVpvs("");
2999 data.longest_float = newSVpvs("");
3000 data.last_found = newSVpvs("");
c277df42
IZ
3001 data.longest = &(data.longest_fixed);
3002 first = scan;
653099ff 3003 if (!r->regstclass) {
830247a4 3004 cl_init(pRExC_state, &ch_class);
653099ff
GS
3005 data.start_class = &ch_class;
3006 stclass_flag = SCF_DO_STCLASS_AND;
3007 } else /* XXXX Check for BOUND? */
3008 stclass_flag = 0;
cb434fcc 3009 data.last_closep = &last_close;
653099ff 3010
830247a4 3011 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3012 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3013 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3014 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3015 && !RExC_seen_zerolen
3016 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3017 r->reganch |= ROPT_CHECK_ALL;
830247a4 3018 scan_commit(pRExC_state, &data);
c277df42
IZ
3019 SvREFCNT_dec(data.last_found);
3020
a0ed51b3 3021 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3022 if (longest_float_length
c277df42
IZ
3023 || (data.flags & SF_FL_BEFORE_EOL
3024 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3025 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3026 int t;
3027
a0ed51b3 3028 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3029 && data.offset_fixed == data.offset_float_min
3030 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3031 goto remove_float; /* As in (a)+. */
3032
33b8afdf
JH
3033 if (SvUTF8(data.longest_float)) {
3034 r->float_utf8 = data.longest_float;
c445ea15 3035 r->float_substr = NULL;
33b8afdf
JH
3036 } else {
3037 r->float_substr = data.longest_float;
c445ea15 3038 r->float_utf8 = NULL;
33b8afdf 3039 }
c277df42
IZ
3040 r->float_min_offset = data.offset_float_min;
3041 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3042 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3043 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3044 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3045 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3046 }
3047 else {
aca2d497 3048 remove_float:
c445ea15 3049 r->float_substr = r->float_utf8 = NULL;
c277df42 3050 SvREFCNT_dec(data.longest_float);
c5254dd6 3051 longest_float_length = 0;
a0d0e21e 3052 }
c277df42 3053
a0ed51b3 3054 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3055 if (longest_fixed_length
c277df42
IZ
3056 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3057 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3058 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3059 int t;
3060
33b8afdf
JH
3061 if (SvUTF8(data.longest_fixed)) {
3062 r->anchored_utf8 = data.longest_fixed;
c445ea15 3063 r->anchored_substr = NULL;
33b8afdf
JH
3064 } else {
3065 r->anchored_substr = data.longest_fixed;
c445ea15 3066 r->anchored_utf8 = NULL;
33b8afdf 3067 }
c277df42 3068 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3069 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3070 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3071 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3072 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3073 }
3074 else {
c445ea15 3075 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3076 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3077 longest_fixed_length = 0;
a0d0e21e 3078 }
b81d288d 3079 if (r->regstclass
ffc61ed2 3080 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3081 r->regstclass = NULL;
33b8afdf
JH
3082 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3083 && stclass_flag
653099ff 3084 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3085 && !cl_is_anything(data.start_class))
3086 {
1df70142 3087 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3088
a02a5408 3089 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3090 struct regnode_charclass_class);
3091 StructCopy(data.start_class,
830247a4 3092 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3093 struct regnode_charclass_class);
830247a4 3094 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3095 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3096 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3097 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3098 PerlIO_printf(Perl_debug_log,
a0288114 3099 "synthetic stclass \"%s\".\n",
3f7c398e 3100 SvPVX_const(sv));});
653099ff 3101 }
c277df42
IZ
3102
3103 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3104 if (longest_fixed_length > longest_float_length) {
c277df42 3105 r->check_substr = r->anchored_substr;
33b8afdf 3106 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3107 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3108 if (r->reganch & ROPT_ANCH_SINGLE)
3109 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3110 }
3111 else {
c277df42 3112 r->check_substr = r->float_substr;
33b8afdf 3113 r->check_utf8 = r->float_utf8;
c277df42
IZ
3114 r->check_offset_min = data.offset_float_min;
3115 r->check_offset_max = data.offset_float_max;
a0d0e21e 3116 }
30382c73
IZ
3117 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3118 This should be changed ASAP! */
33b8afdf 3119 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3120 r->reganch |= RE_USE_INTUIT;
33b8afdf 3121 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3122 r->reganch |= RE_INTUIT_TAIL;
3123 }
a0ed51b3
LW
3124 }
3125 else {
c277df42
IZ
3126 /* Several toplevels. Best we can is to set minlen. */
3127 I32 fake;
653099ff 3128 struct regnode_charclass_class ch_class;
cb434fcc 3129 I32 last_close = 0;
c277df42 3130
a3621e74 3131 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3132 scan = r->program + 1;
830247a4 3133 cl_init(pRExC_state, &ch_class);
653099ff 3134 data.start_class = &ch_class;
cb434fcc 3135 data.last_closep = &last_close;
a3621e74 3136 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3137 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3138 = r->float_substr = r->float_utf8 = NULL;
653099ff 3139 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3140 && !cl_is_anything(data.start_class))
3141 {
1df70142 3142 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3143
a02a5408 3144 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3145 struct regnode_charclass_class);
3146 StructCopy(data.start_class,
830247a4 3147 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3148 struct regnode_charclass_class);
830247a4 3149 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3150 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3151 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3152 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3153 PerlIO_printf(Perl_debug_log,
a0288114 3154 "synthetic stclass \"%s\".\n",
3f7c398e 3155 SvPVX_const(sv));});
653099ff 3156 }
a0d0e21e
LW
3157 }
3158
a0d0e21e 3159 r->minlen = minlen;
b81d288d 3160 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3161 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3162 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3163 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3164 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3165 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3166 if (RExC_seen & REG_SEEN_CANY)
3167 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3168 Newxz(r->startp, RExC_npar, I32);
3169 Newxz(r->endp, RExC_npar, I32);
a3621e74 3170 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3171 return(r);
a687059c
LW
3172}
3173
3174/*
3175 - reg - regular expression, i.e. main body or parenthesized thing
3176 *
3177 * Caller must absorb opening parenthesis.
3178 *
3179 * Combining parenthesis handling with the base level of regular expression
3180 * is a trifle forced, but the need to tie the tails of the branches to what
3181 * follows makes it hard to avoid.
3182 */
76e3520e 3183STATIC regnode *
830247a4 3184S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3185 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3186{
27da23d5 3187 dVAR;
c277df42
IZ
3188 register regnode *ret; /* Will be the head of the group. */
3189 register regnode *br;
3190 register regnode *lastbr;
cbbf8932 3191 register regnode *ender = NULL;
a0d0e21e 3192 register I32 parno = 0;
cbbf8932
AL
3193 I32 flags;
3194 const I32 oregflags = RExC_flags;
6136c704
AL
3195 bool have_branch = 0;
3196 bool is_open = 0;
9d1d55b5
JP
3197
3198 /* for (?g), (?gc), and (?o) warnings; warning
3199 about (?c) will warn about (?g) -- japhy */
3200
6136c704
AL
3201#define WASTED_O 0x01
3202#define WASTED_G 0x02
3203#define WASTED_C 0x04
3204#define WASTED_GC (0x02|0x04)
cbbf8932 3205 I32 wastedflags = 0x00;
9d1d55b5 3206
fac92740 3207 char * parse_start = RExC_parse; /* MJD */
a28509cc 3208 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3209
821b33a5 3210 *flagp = 0; /* Tentatively. */
a0d0e21e 3211
9d1d55b5 3212
a0d0e21e
LW
3213 /* Make an OPEN node, if parenthesized. */
3214 if (paren) {
fac92740 3215 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3216 U32 posflags = 0, negflags = 0;
3217 U32 *flagsp = &posflags;
6136c704 3218 bool is_logical = 0;
a28509cc 3219 const char * const seqstart = RExC_parse;
ca9dfc88 3220
830247a4
IZ
3221 RExC_parse++;
3222 paren = *RExC_parse++;
c277df42 3223 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3224 switch (paren) {
fac92740 3225 case '<': /* (?<...) */
830247a4 3226 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3227 if (*RExC_parse == '!')
c277df42 3228 paren = ',';
b81d288d 3229 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3230 goto unknown;
830247a4 3231 RExC_parse++;
fac92740
MJD
3232 case '=': /* (?=...) */
3233 case '!': /* (?!...) */
830247a4 3234 RExC_seen_zerolen++;
fac92740
MJD
3235 case ':': /* (?:...) */
3236 case '>': /* (?>...) */
a0d0e21e 3237 break;
fac92740
MJD
3238 case '$': /* (?$...) */
3239 case '@': /* (?@...) */
8615cb43 3240 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3241 break;
fac92740 3242 case '#': /* (?#...) */
830247a4
IZ
3243 while (*RExC_parse && *RExC_parse != ')')
3244 RExC_parse++;
3245 if (*RExC_parse != ')')
c277df42 3246 FAIL("Sequence (?#... not terminated");
830247a4 3247 nextchar(pRExC_state);
a0d0e21e
LW
3248 *flagp = TRYAGAIN;
3249 return NULL;
fac92740 3250 case 'p': /* (?p...) */
9014280d 3251 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3252 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3253 /* FALL THROUGH*/
fac92740 3254 case '?': /* (??...) */
6136c704 3255 is_logical = 1;
438a3801
YST
3256 if (*RExC_parse != '{')
3257 goto unknown;
830247a4 3258 paren = *RExC_parse++;
0f5d15d6 3259 /* FALL THROUGH */
fac92740 3260 case '{': /* (?{...}) */
c277df42 3261 {
c277df42
IZ
3262 I32 count = 1, n = 0;
3263 char c;
830247a4 3264 char *s = RExC_parse;
c277df42 3265
830247a4
IZ
3266 RExC_seen_zerolen++;
3267 RExC_seen |= REG_SEEN_EVAL;
3268 while (count && (c = *RExC_parse)) {
6136c704
AL
3269 if (c == '\\') {
3270 if (RExC_parse[1])
3271 RExC_parse++;
3272 }
b81d288d 3273 else if (c == '{')
c277df42 3274 count++;
b81d288d 3275 else if (c == '}')
c277df42 3276 count--;
830247a4 3277 RExC_parse++;
c277df42 3278 }
6136c704 3279 if (*RExC_parse != ')') {
b81d288d 3280 RExC_parse = s;
b45f050a
JF
3281 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3282 }
c277df42 3283 if (!SIZE_ONLY) {
f3548bdc 3284 PAD *pad;
6136c704
AL
3285 OP_4tree *sop, *rop;
3286 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3287
569233ed
SB
3288 ENTER;
3289 Perl_save_re_context(aTHX);
f3548bdc 3290 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3291 sop->op_private |= OPpREFCOUNTED;
3292 /* re_dup will OpREFCNT_inc */
3293 OpREFCNT_set(sop, 1);
569233ed 3294 LEAVE;
c277df42 3295
830247a4
IZ
3296 n = add_data(pRExC_state, 3, "nop");
3297 RExC_rx->data->data[n] = (void*)rop;
3298 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3299 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3300 SvREFCNT_dec(sv);
a0ed51b3 3301 }
e24b16f9 3302 else { /* First pass */
830247a4 3303 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3304 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3305 /* No compiled RE interpolated, has runtime
3306 components ===> unsafe. */
3307 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3308 if (PL_tainting && PL_tainted)
cc6b7395 3309 FAIL("Eval-group in insecure regular expression");
54df2634 3310#if PERL_VERSION > 8
923e4eb5 3311 if (IN_PERL_COMPILETIME)
b5c19bd7 3312 PL_cv_has_eval = 1;
54df2634 3313#endif
c277df42 3314 }
b5c19bd7 3315
830247a4 3316 nextchar(pRExC_state);
6136c704 3317 if (is_logical) {
830247a4 3318 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3319 if (!SIZE_ONLY)
3320 ret->flags = 2;
830247a4 3321 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3322 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3323 return ret;
3324 }
ccb2c380
MP
3325 ret = reganode(pRExC_state, EVAL, n);
3326 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3327 Set_Node_Offset(ret, parse_start);
3328 return ret;
c277df42 3329 }
fac92740 3330 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3331 {
fac92740 3332 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3333 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3334 || RExC_parse[1] == '<'
830247a4 3335 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3336 I32 flag;
3337
830247a4 3338 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3339 if (!SIZE_ONLY)
3340 ret->flags = 1;
830247a4 3341 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3342 goto insert_if;
b81d288d 3343 }
a0ed51b3 3344 }
830247a4 3345 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3346 /* (?(1)...) */
6136c704 3347 char c;
830247a4 3348 parno = atoi(RExC_parse++);
c277df42 3349
830247a4
IZ
3350 while (isDIGIT(*RExC_parse))
3351 RExC_parse++;
fac92740 3352 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3353
830247a4 3354 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3355 vFAIL("Switch condition not recognized");
c277df42 3356 insert_if:
830247a4
IZ
3357 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3358 br = regbranch(pRExC_state, &flags, 1);
c277df42 3359 if (br == NULL)
830247a4 3360 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3361 else
830247a4
IZ
3362 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3363 c = *nextchar(pRExC_state);
d1b80229
IZ
3364 if (flags&HASWIDTH)
3365 *flagp |= HASWIDTH;
c277df42 3366 if (c == '|') {
830247a4
IZ
3367 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3368 regbranch(pRExC_state, &flags, 1);
3369 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3370 if (flags&HASWIDTH)
3371 *flagp |= HASWIDTH;
830247a4 3372 c = *nextchar(pRExC_state);
a0ed51b3
LW
3373 }
3374 else
c277df42
IZ
3375 lastbr = NULL;
3376 if (c != ')')
8615cb43 3377 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3378 ender = reg_node(pRExC_state, TAIL);
3379 regtail(pRExC_state, br, ender);
c277df42 3380 if (lastbr) {
830247a4
IZ
3381 regtail(pRExC_state, lastbr, ender);
3382 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3383 }
3384 else
830247a4 3385 regtail(pRExC_state, ret, ender);
c277df42 3386 return ret;
a0ed51b3
LW
3387 }
3388 else {
830247a4 3389 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3390 }
3391 }
1b1626e4 3392 case 0:
830247a4 3393 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3394 vFAIL("Sequence (? incomplete");
1b1626e4 3395 break;
a0d0e21e 3396 default:
830247a4 3397 --RExC_parse;
fac92740 3398 parse_flags: /* (?i) */
830247a4 3399 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3400 /* (?g), (?gc) and (?o) are useless here
3401 and must be globally applied -- japhy */
3402
3403 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3404 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3405 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3406 if (! (wastedflags & wflagbit) ) {
3407 wastedflags |= wflagbit;
3408 vWARN5(
3409 RExC_parse + 1,
3410 "Useless (%s%c) - %suse /%c modifier",
3411 flagsp == &negflags ? "?-" : "?",
3412 *RExC_parse,
3413 flagsp == &negflags ? "don't " : "",
3414 *RExC_parse
3415 );
3416 }
3417 }
3418 }
3419 else if (*RExC_parse == 'c') {
3420 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3421 if (! (wastedflags & WASTED_C) ) {
3422 wastedflags |= WASTED_GC;
9d1d55b5
JP
3423 vWARN3(
3424 RExC_parse + 1,
3425 "Useless (%sc) - %suse /gc modifier",
3426 flagsp == &negflags ? "?-" : "?",
3427 flagsp == &negflags ? "don't " : ""
3428 );
3429 }
3430 }
3431 }
3432 else { pmflag(flagsp, *RExC_parse); }
3433
830247a4 3434 ++RExC_parse;
ca9dfc88 3435 }
830247a4 3436 if (*RExC_parse == '-') {
ca9dfc88 3437 flagsp = &negflags;
9d1d55b5 3438 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3439 ++RExC_parse;
ca9dfc88 3440 goto parse_flags;
48c036b1 3441 }
e2509266
JH
3442 RExC_flags |= posflags;
3443 RExC_flags &= ~negflags;
830247a4
IZ
3444 if (*RExC_parse == ':') {
3445 RExC_parse++;
ca9dfc88
IZ
3446 paren = ':';
3447 break;
3448 }
c277df42 3449 unknown:
830247a4
IZ
3450 if (*RExC_parse != ')') {
3451 RExC_parse++;
3452 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3453 }
830247a4 3454 nextchar(pRExC_state);
a0d0e21e
LW
3455 *flagp = TRYAGAIN;
3456 return NULL;
3457 }
3458 }
fac92740 3459 else { /* (...) */
830247a4
IZ
3460 parno = RExC_npar;
3461 RExC_npar++;
3462 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3463 Set_Node_Length(ret, 1); /* MJD */
3464 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 3465 is_open = 1;
a0d0e21e 3466 }
a0ed51b3 3467 }
fac92740 3468 else /* ! paren */
a0d0e21e
LW
3469 ret = NULL;
3470
3471 /* Pick up the branches, linking them together. */
fac92740 3472 parse_start = RExC_parse; /* MJD */
830247a4 3473 br = regbranch(pRExC_state, &flags, 1);
fac92740 3474 /* branch_len = (paren != 0); */
2af232bd 3475
a0d0e21e
LW
3476 if (br == NULL)
3477 return(NULL);
830247a4
IZ
3478 if (*RExC_parse == '|') {
3479 if (!SIZE_ONLY && RExC_extralen) {
3480 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3481 }
fac92740 3482 else { /* MJD */
830247a4 3483 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3484 Set_Node_Length(br, paren != 0);
3485 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3486 }
c277df42
IZ
3487 have_branch = 1;
3488 if (SIZE_ONLY)
830247a4 3489 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3490 }
3491 else if (paren == ':') {
c277df42
IZ
3492 *flagp |= flags&SIMPLE;
3493 }
6136c704 3494 if (is_open) { /* Starts with OPEN. */
830247a4 3495 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3496 }
3497 else if (paren != '?') /* Not Conditional */
a0d0e21e 3498 ret = br;
32a0ca98 3499 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3500 lastbr = br;
830247a4
IZ
3501 while (*RExC_parse == '|') {
3502 if (!SIZE_ONLY && RExC_extralen) {
3503 ender = reganode(pRExC_state, LONGJMP,0);
3504 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3505 }
3506 if (SIZE_ONLY)
830247a4
IZ
3507 RExC_extralen += 2; /* Account for LONGJMP. */
3508 nextchar(pRExC_state);
3509 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3510
a687059c 3511 if (br == NULL)
a0d0e21e 3512 return(NULL);
830247a4 3513 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3514 lastbr = br;
821b33a5
IZ
3515 if (flags&HASWIDTH)
3516 *flagp |= HASWIDTH;
a687059c 3517 *flagp |= flags&SPSTART;
a0d0e21e
LW
3518 }
3519
c277df42
IZ
3520 if (have_branch || paren != ':') {
3521 /* Make a closing node, and hook it on the end. */
3522 switch (paren) {
3523 case ':':
830247a4 3524 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3525 break;
3526 case 1:
830247a4 3527 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3528 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3529 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3530 break;
3531 case '<':
c277df42
IZ
3532 case ',':
3533 case '=':
3534 case '!':
c277df42 3535 *flagp &= ~HASWIDTH;
821b33a5
IZ
3536 /* FALL THROUGH */
3537 case '>':
830247a4 3538 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3539 break;
3540 case 0:
830247a4 3541 ender = reg_node(pRExC_state, END);
c277df42
IZ
3542 break;
3543 }
830247a4 3544 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3545
9674d46a 3546 if (have_branch && !SIZE_ONLY) {
c277df42 3547 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
3548 for (br = ret; br; br = regnext(br)) {
3549 const U8 op = PL_regkind[OP(br)];
3550 if (op == BRANCH) {
3551 regtail(pRExC_state, NEXTOPER(br), ender);
3552 }
3553 else if (op == BRANCHJ) {
3554 regtail(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
3555 }
c277df42
IZ
3556 }
3557 }
a0d0e21e 3558 }
c277df42
IZ
3559
3560 {
e1ec3a88
AL
3561 const char *p;
3562 static const char parens[] = "=!<,>";
c277df42
IZ
3563
3564 if (paren && (p = strchr(parens, paren))) {
eb160463 3565 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;