This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Break out the item normalisation code into a method normalise_items.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
a687059c
LW
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
864dbfa3 93#define PERL_IN_REGCOMP_C
a687059c 94#include "perl.h"
d06ea78c 95
acfe0abc 96#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
97# include "INTERN.h"
98#endif
c277df42
IZ
99
100#define REG_COMP_C
a687059c
LW
101#include "regcomp.h"
102
d4cce5f1 103#ifdef op
11343788 104#undef op
d4cce5f1 105#endif /* op */
11343788 106
fe14fcc3 107#ifdef MSDOS
7e4e8c89 108# if defined(BUGGY_MSC6)
fe14fcc3 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 110# pragma optimize("a",off)
fe14fcc3 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
fe14fcc3
LW
114#endif /* MSDOS */
115
a687059c
LW
116#ifndef STATIC
117#define STATIC static
118#endif
119
830247a4 120typedef struct RExC_state_t {
e2509266 121 U32 flags; /* are we folding, multilining? */
830247a4
IZ
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
fac92740 124 char *start; /* Start of input for compile */
830247a4
IZ
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 128 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
1aa99e6b 138 I32 utf8;
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
e2509266 145#define RExC_flags (pRExC_state->flags)
830247a4
IZ
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
fac92740 148#define RExC_start (pRExC_state->start)
830247a4
IZ
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 153#define RExC_emit (pRExC_state->emit)
fac92740 154#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
830247a4 164
a687059c
LW
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
a687059c 168
35c8bce7
LW
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
a687059c
LW
172/*
173 * Flags to be passed up and down.
174 */
a687059c 175#define WORST 0 /* Worst case. */
821b33a5 176#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 180
2c2d71f5
JH
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
cb434fcc 200 I32 *last_closep;
653099ff 201 struct regnode_charclass_class *start_class;
2c2d71f5
JH
202} scan_data_t;
203
a687059c 204/*
e50aee73 205 * Forward declarations for pregcomp()'s friends.
a687059c 206 */
a0d0e21e 207
27da23d5
JH
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
09b7f37c
CB
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
c277df42
IZ
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
4bfe0158 234#define SCF_DO_SUBSTR 0x400
653099ff
GS
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 238#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 239
eb160463 240#define UTF (RExC_utf8 != 0)
e2509266
JH
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 243
ffc61ed2 244#define OOB_UNICODE 12345678
93733859 245#define OOB_NAMEDCLASS -1
b8c5462f 246
a0ed51b3
LW
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
8615cb43 250
b45f050a
JF
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
7253e4e3
RK
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 261
7253e4e3 262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
ccb2c380 269#define FAIL(msg) STMT_START { \
bfed75c6 270 const char *ellipses = ""; \
ccb2c380
MP
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
8615cb43 283
b45f050a
JF
284/*
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
287 * "...".
288 */
ccb2c380 289#define FAIL2(pat,msg) STMT_START { \
bfed75c6 290 const char *ellipses = ""; \
ccb2c380
MP
291 IV len = RExC_end - RExC_precomp; \
292 \
293 if (!SIZE_ONLY) \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
298 ellipses = "..."; \
299 } \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
302} STMT_END
b45f050a
JF
303
304
305/*
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
307 */
ccb2c380 308#define Simple_vFAIL(m) STMT_START { \
a28509cc 309 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
312} STMT_END
b45f050a
JF
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
ccb2c380
MP
317#define vFAIL(m) STMT_START { \
318 if (!SIZE_ONLY) \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
320 Simple_vFAIL(m); \
321} STMT_END
b45f050a
JF
322
323/*
324 * Like Simple_vFAIL(), but accepts two arguments.
325 */
ccb2c380 326#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 327 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
330} STMT_END
b45f050a
JF
331
332/*
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 */
ccb2c380
MP
335#define vFAIL2(m,a1) STMT_START { \
336 if (!SIZE_ONLY) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
339} STMT_END
b45f050a
JF
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
ccb2c380 345#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 346 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
b45f050a
JF
350
351/*
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
353 */
ccb2c380
MP
354#define vFAIL3(m,a1,a2) STMT_START { \
355 if (!SIZE_ONLY) \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
358} STMT_END
b45f050a
JF
359
360/*
361 * Like Simple_vFAIL(), but accepts four arguments.
362 */
ccb2c380 363#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 364 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
b45f050a 368
ccb2c380 369#define vWARN(loc,m) STMT_START { \
a28509cc 370 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
374
375#define vWARNdep(loc,m) STMT_START { \
a28509cc 376 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
380} STMT_END
381
382
383#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 384 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
388
389#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 390 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 396 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 402 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
9d1d55b5 406
8615cb43 407
cd439c50 408/* Allow for side effects in s */
ccb2c380
MP
409#define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411} STMT_END
cd439c50 412
fac92740
MJD
413/* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
417 */
418
419#define MJD_OFFSET_DEBUG(x)
a3621e74 420/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
421
422
423#define Set_Node_Offset_To_R(node,byte) STMT_START { \
424 if (! SIZE_ONLY) { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
427 if((node) < 0) { \
551405c4 428 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
429 } else { \
430 RExC_offsets[2*(node)-1] = (byte); \
431 } \
432 } \
433} STMT_END
434
435#define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439#define Set_Node_Length_To_R(node,len) STMT_START { \
440 if (! SIZE_ONLY) { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 442 __LINE__, (int)(node), (int)(len))); \
ccb2c380 443 if((node) < 0) { \
551405c4 444 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
445 } else { \
446 RExC_offsets[2*(node)] = (len); \
447 } \
448 } \
449} STMT_END
450
451#define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454#define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
456
457/* Get offsets and lengths */
458#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
acfe0abc 461static void clear_re(pTHX_ void *r);
4327152a 462
653099ff
GS
463/* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
466
4327152a 467STATIC void
830247a4 468S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 469{
e1ec3a88
AL
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 472
c277df42 473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 474 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
b81d288d 478 data->flags
c277df42
IZ
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480 else
481 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
482 }
483 else {
c277df42 484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
485 data->offset_float_max = (l
486 ? data->last_start_max
c277df42 487 : data->pos_min + data->pos_delta);
9051bda5
HS
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
c277df42 490 if (data->flags & SF_BEFORE_EOL)
b81d288d 491 data->flags
c277df42
IZ
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493 else
494 data->flags &= ~SF_FL_BEFORE_EOL;
495 }
496 }
497 SvCUR_set(data->last_found, 0);
0eda9292 498 {
a28509cc
AL
499 SV * const sv = data->last_found;
500 MAGIC * const mg =
0eda9292
JH
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
503 mg->mg_len = 0;
504 }
c277df42
IZ
505 data->last_end = -1;
506 data->flags &= ~SF_BEFORE_EOL;
507}
508
653099ff
GS
509/* Can match anything (initialization) */
510STATIC void
830247a4 511S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 512{
653099ff 513 ANYOF_CLASS_ZERO(cl);
f8bef550 514 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
516 if (LOC)
517 cl->flags |= ANYOF_LOCALE;
518}
519
520/* Can match anything (initialization) */
521STATIC int
a28509cc 522S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
653099ff
GS
523{
524 int value;
525
aaa51d5e 526 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528 return 1;
1aa99e6b
IH
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
530 return 0;
f8bef550
NC
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
532 return 0;
653099ff
GS
533 return 1;
534}
535
536/* Can match anything (initialization) */
537STATIC void
830247a4 538S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 539{
8ecf7187 540 Zero(cl, 1, struct regnode_charclass_class);
653099ff 541 cl->type = ANYOF;
830247a4 542 cl_anything(pRExC_state, cl);
653099ff
GS
543}
544
545STATIC void
830247a4 546S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 547{
8ecf7187 548 Zero(cl, 1, struct regnode_charclass_class);
653099ff 549 cl->type = ANYOF;
830247a4 550 cl_anything(pRExC_state, cl);
653099ff
GS
551 if (LOC)
552 cl->flags |= ANYOF_LOCALE;
553}
554
555/* 'And' a given class with another one. Can create false positives */
556/* We assume that cl is not inverted */
557STATIC void
558S_cl_and(pTHX_ struct regnode_charclass_class *cl,
a28509cc 559 const struct regnode_charclass_class *and_with)
653099ff 560{
653099ff
GS
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
566 int i;
567
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
571 else
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
1aa99e6b 577
14ebb1a2
JH
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
583 }
14ebb1a2
JH
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 586 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 589 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
590}
591
592/* 'OR' a given class with another one. Can create false positives */
593/* We assume that cl is not inverted */
594STATIC void
a28509cc 595S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 596{
653099ff
GS
597 if (or_with->flags & ANYOF_INVERT) {
598 /* We do not use
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606 */
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
610 int i;
611
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
615 else {
830247a4 616 cl_anything(pRExC_state, cl);
653099ff
GS
617 }
618 } else {
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 621 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
622 || (cl->flags & ANYOF_FOLD)) ) {
623 int i;
624
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
632 }
633 }
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 635 cl_anything(pRExC_state, cl);
653099ff
GS
636 }
637 }
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
640
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
645 }
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
649 }
653099ff
GS
650}
651
5d1c421c 652/*
a3621e74
YO
653
654 make_trie(startbranch,first,last,tail,flags)
655 startbranch: the first branch in the whole branch sequence
656 first : start branch of sequence of branch-exact nodes.
657 May be the same as startbranch
658 last : Thing following the last branch.
659 May be the same as tail.
660 tail : item following the branch sequence
661 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
662
663Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
664
665A trie is an N'ary tree where the branches are determined by digital
666decomposition of the key. IE, at the root node you look up the 1st character and
667follow that branch repeat until you find the end of the branches. Nodes can be
668marked as "accepting" meaning they represent a complete word. Eg:
669
670 /he|she|his|hers/
671
672would convert into the following structure. Numbers represent states, letters
673following numbers represent valid transitions on the letter from that state, if
674the number is in square brackets it represents an accepting state, otherwise it
675will be in parenthesis.
676
677 +-h->+-e->[3]-+-r->(8)-+-s->[9]
678 | |
679 | (2)
680 | |
681 (1) +-i->(6)-+-s->[7]
682 |
683 +-s->(3)-+-h->(4)-+-e->[5]
684
685 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
686
687This shows that when matching against the string 'hers' we will begin at state 1
688read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
689then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
690is also accepting. Thus we know that we can match both 'he' and 'hers' with a
691single traverse. We store a mapping from accepting to state to which word was
692matched, and then when we have multiple possibilities we try to complete the
693rest of the regex in the order in which they occured in the alternation.
694
695The only prior NFA like behaviour that would be changed by the TRIE support is
696the silent ignoring of duplicate alternations which are of the form:
697
698 / (DUPE|DUPE) X? (?{ ... }) Y /x
699
700Thus EVAL blocks follwing a trie may be called a different number of times with
701and without the optimisation. With the optimisations dupes will be silently
702ignored. This inconsistant behaviour of EVAL type nodes is well established as
703the following demonstrates:
704
705 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
706
707which prints out 'word' three times, but
708
709 'words'=~/(word|word|word)(?{ print $1 })S/
710
711which doesnt print it out at all. This is due to other optimisations kicking in.
712
713Example of what happens on a structural level:
714
715The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
716
717 1: CURLYM[1] {1,32767}(18)
718 5: BRANCH(8)
719 6: EXACT <ac>(16)
720 8: BRANCH(11)
721 9: EXACT <ad>(16)
722 11: BRANCH(14)
723 12: EXACT <ab>(16)
724 16: SUCCEED(0)
725 17: NOTHING(18)
726 18: END(0)
727
728This would be optimizable with startbranch=5, first=5, last=16, tail=16
729and should turn into:
730
731 1: CURLYM[1] {1,32767}(18)
732 5: TRIE(16)
733 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
734 <ac>
735 <ad>
736 <ab>
737 16: SUCCEED(0)
738 17: NOTHING(18)
739 18: END(0)
740
741Cases where tail != last would be like /(?foo|bar)baz/:
742
743 1: BRANCH(4)
744 2: EXACT <foo>(8)
745 4: BRANCH(7)
746 5: EXACT <bar>(8)
747 7: TAIL(8)
748 8: EXACT <baz>(10)
749 10: END(0)
750
751which would be optimizable with startbranch=1, first=1, last=7, tail=8
752and would end up looking like:
753
754 1: TRIE(8)
755 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
756 <foo>
757 <bar>
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762*/
763
764#define TRIE_DEBUG_CHAR \
765 DEBUG_TRIE_COMPILE_r({ \
766 SV *tmp; \
767 if ( UTF ) { \
d0043bd1 768 tmp = newSVpvn( "", 0 ); \
a3621e74
YO
769 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
770 } else { \
e4584336 771 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
772 } \
773 av_push( trie->revcharmap, tmp ); \
774 })
775
776#define TRIE_READ_CHAR STMT_START { \
777 if ( UTF ) { \
778 if ( folder ) { \
779 if ( foldlen > 0 ) { \
780 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
781 foldlen -= len; \
782 scan += len; \
783 len = 0; \
784 } else { \
e1ec3a88 785 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
786 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
787 foldlen -= UNISKIP( uvc ); \
788 scan = foldbuf + UNISKIP( uvc ); \
789 } \
790 } else { \
e1ec3a88 791 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
792 } \
793 } else { \
794 uvc = (U32)*uc; \
795 len = 1; \
796 } \
797} STMT_END
798
799
800#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
801#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
802#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
803#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
804
805#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
806 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
807 TRIE_LIST_LEN( state ) *= 2; \
808 Renew( trie->states[ state ].trans.list, \
809 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
810 } \
811 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
813 TRIE_LIST_CUR( state )++; \
814} STMT_END
815
816#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 817 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
818 4, reg_trie_trans_le ); \
819 TRIE_LIST_CUR( state ) = 1; \
820 TRIE_LIST_LEN( state ) = 4; \
821} STMT_END
822
823STATIC I32
824S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
825{
27da23d5 826 dVAR;
a3621e74
YO
827 /* first pass, loop through and scan words */
828 reg_trie_data *trie;
829 regnode *cur;
e1ec3a88 830 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
831 STRLEN len = 0;
832 UV uvc = 0;
833 U16 curword = 0;
834 U32 next_alloc = 0;
835 /* we just use folder as a flag in utf8 */
e1ec3a88 836 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
837 ? PL_fold
838 : ( flags == EXACTFL
839 ? PL_fold_locale
840 : NULL
841 )
842 );
843
e1ec3a88 844 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
845 SV *re_trie_maxbuff;
846
847 GET_RE_DEBUG_FLAGS_DECL;
848
a02a5408 849 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
850 trie->refcount = 1;
851 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 852 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
853 DEBUG_r({
854 trie->words = newAV();
855 trie->revcharmap = newAV();
856 });
857
858
0111c4fd 859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 860 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
862 }
863
864 /* -- First loop and Setup --
865
866 We first traverse the branches and scan each word to determine if it
867 contains widechars, and how many unique chars there are, this is
868 important as we have to build a table with at least as many columns as we
869 have unique chars.
870
871 We use an array of integers to represent the character codes 0..255
872 (trie->charmap) and we use a an HV* to store unicode characters. We use the
873 native representation of the character value as the key and IV's for the
874 coded index.
875
876 *TODO* If we keep track of how many times each character is used we can
877 remap the columns so that the table compression later on is more
878 efficient in terms of memory by ensuring most common value is in the
879 middle and the least common are on the outside. IMO this would be better
880 than a most to least common mapping as theres a decent chance the most
881 common letter will share a node with the least common, meaning the node
882 will not be compressable. With a middle is most common approach the worst
883 case is when we have the least common nodes twice.
884
885 */
886
887
888 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
889 regnode *noper = NEXTOPER( cur );
e1ec3a88 890 const U8 *uc = (U8*)STRING( noper );
a28509cc 891 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
892 STRLEN foldlen = 0;
893 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 894 const U8 *scan = (U8*)NULL;
a3621e74
YO
895
896 for ( ; uc < e ; uc += len ) {
897 trie->charcount++;
898 TRIE_READ_CHAR;
899 if ( uvc < 256 ) {
900 if ( !trie->charmap[ uvc ] ) {
901 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
902 if ( folder )
903 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
904 TRIE_DEBUG_CHAR;
905 }
906 } else {
907 SV** svpp;
908 if ( !trie->widecharmap )
909 trie->widecharmap = newHV();
910
911 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
912
913 if ( !svpp )
e4584336 914 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
915
916 if ( !SvTRUE( *svpp ) ) {
917 sv_setiv( *svpp, ++trie->uniquecharcount );
918 TRIE_DEBUG_CHAR;
919 }
920 }
921 }
922 trie->wordcount++;
923 } /* end first pass */
924 DEBUG_TRIE_COMPILE_r(
925 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
926 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 927 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
928 );
929
930
931 /*
932 We now know what we are dealing with in terms of unique chars and
933 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
934 representation using a flat table will take. If it's over a reasonable
935 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
936 conservative but potentially much slower representation using an array
937 of lists.
938
939 At the end we convert both representations into the same compressed
940 form that will be used in regexec.c for matching with. The latter
941 is a form that cannot be used to construct with but has memory
942 properties similar to the list form and access properties similar
943 to the table form making it both suitable for fast searches and
944 small enough that its feasable to store for the duration of a program.
945
946 See the comment in the code where the compressed table is produced
947 inplace from the flat tabe representation for an explanation of how
948 the compression works.
949
950 */
951
952
953 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
954 /*
955 Second Pass -- Array Of Lists Representation
956
957 Each state will be represented by a list of charid:state records
958 (reg_trie_trans_le) the first such element holds the CUR and LEN
959 points of the allocated array. (See defines above).
960
961 We build the initial structure using the lists, and then convert
962 it into the compressed table form which allows faster lookups
963 (but cant be modified once converted).
964
965
966 */
967
968
969 STRLEN transcount = 1;
970
a02a5408 971 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
972 TRIE_LIST_NEW(1);
973 next_alloc = 2;
974
975 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
976
977 regnode *noper = NEXTOPER( cur );
978 U8 *uc = (U8*)STRING( noper );
a28509cc 979 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
980 U32 state = 1; /* required init */
981 U16 charid = 0; /* sanity init */
982 U8 *scan = (U8*)NULL; /* sanity init */
983 STRLEN foldlen = 0; /* required init */
984 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
985
986
987 for ( ; uc < e ; uc += len ) {
988
989 TRIE_READ_CHAR;
990
991 if ( uvc < 256 ) {
992 charid = trie->charmap[ uvc ];
993 } else {
994 SV** svpp=(SV**)NULL;
995 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
996 if ( !svpp ) {
997 charid = 0;
998 } else {
999 charid=(U16)SvIV( *svpp );
1000 }
1001 }
1002 if ( charid ) {
1003
1004 U16 check;
1005 U32 newstate = 0;
1006
1007 charid--;
1008 if ( !trie->states[ state ].trans.list ) {
1009 TRIE_LIST_NEW( state );
1010 }
1011 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1012 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1013 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1014 break;
1015 }
a28509cc
AL
1016 }
1017 if ( ! newstate ) {
1018 newstate = next_alloc++;
1019 TRIE_LIST_PUSH( state, charid, newstate );
1020 transcount++;
1021 }
1022 state = newstate;
a3621e74 1023 } else {
e4584336 1024 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1025 }
1026 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1027 }
1028
1029 if ( !trie->states[ state ].wordnum ) {
1030 /* we havent inserted this word into the structure yet. */
1031 trie->states[ state ].wordnum = ++curword;
1032
1033 DEBUG_r({
1034 /* store the word for dumping */
1035 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1036 if ( UTF ) SvUTF8_on( tmp );
1037 av_push( trie->words, tmp );
1038 });
1039
1040 } else {
1041 /* Its a dupe. So ignore it. */
1042 }
1043
1044 } /* end second pass */
1045
1046 trie->laststate = next_alloc;
1047 Renew( trie->states, next_alloc, reg_trie_state );
1048
1049 DEBUG_TRIE_COMPILE_MORE_r({
1050 U32 state;
a3621e74 1051
a28509cc 1052 /* print out the table precompression. */
a3621e74
YO
1053
1054 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1055 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1056
1057 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1058 U16 charid;
a3621e74 1059
e4584336 1060 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1061 if ( ! trie->states[ state ].wordnum ) {
1062 PerlIO_printf( Perl_debug_log, "%5s| ","");
1063 } else {
e4584336 1064 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1065 trie->states[ state ].wordnum
1066 );
1067 }
1068 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1069 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1070 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1071 SvPV_nolen_const( *tmp ),
a3621e74 1072 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1073 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1074 );
1075 }
1076
1077 }
1078 PerlIO_printf( Perl_debug_log, "\n\n" );
1079 });
1080
a02a5408 1081 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1082 {
1083 U32 state;
a3621e74
YO
1084 U32 tp = 0;
1085 U32 zp = 0;
1086
1087
1088 for( state=1 ; state < next_alloc ; state ++ ) {
1089 U32 base=0;
1090
1091 /*
1092 DEBUG_TRIE_COMPILE_MORE_r(
1093 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1094 );
1095 */
1096
1097 if (trie->states[state].trans.list) {
1098 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1099 U16 maxid=minid;
a28509cc 1100 U16 idx;
a3621e74
YO
1101
1102 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1103 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1104 minid=TRIE_LIST_ITEM( state, idx).forid;
1105 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1106 maxid=TRIE_LIST_ITEM( state, idx).forid;
1107 }
1108 }
1109 if ( transcount < tp + maxid - minid + 1) {
1110 transcount *= 2;
1111 Renew( trie->trans, transcount, reg_trie_trans );
1112 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1113 }
1114 base = trie->uniquecharcount + tp - minid;
1115 if ( maxid == minid ) {
1116 U32 set = 0;
1117 for ( ; zp < tp ; zp++ ) {
1118 if ( ! trie->trans[ zp ].next ) {
1119 base = trie->uniquecharcount + zp - minid;
1120 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1121 trie->trans[ zp ].check = state;
1122 set = 1;
1123 break;
1124 }
1125 }
1126 if ( !set ) {
1127 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1128 trie->trans[ tp ].check = state;
1129 tp++;
1130 zp = tp;
1131 }
1132 } else {
1133 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1134 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1135 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1136 trie->trans[ tid ].check = state;
1137 }
1138 tp += ( maxid - minid + 1 );
1139 }
1140 Safefree(trie->states[ state ].trans.list);
1141 }
1142 /*
1143 DEBUG_TRIE_COMPILE_MORE_r(
1144 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1145 );
1146 */
1147 trie->states[ state ].trans.base=base;
1148 }
cc601c31 1149 trie->lasttrans = tp + 1;
a3621e74
YO
1150 }
1151 } else {
1152 /*
1153 Second Pass -- Flat Table Representation.
1154
1155 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1156 We know that we will need Charcount+1 trans at most to store the data
1157 (one row per char at worst case) So we preallocate both structures
1158 assuming worst case.
1159
1160 We then construct the trie using only the .next slots of the entry
1161 structs.
1162
1163 We use the .check field of the first entry of the node temporarily to
1164 make compression both faster and easier by keeping track of how many non
1165 zero fields are in the node.
1166
1167 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1168 transition.
1169
1170 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1171 number representing the first entry of the node, and state as a
1172 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1173 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1174 are 2 entrys per node. eg:
1175
1176 A B A B
1177 1. 2 4 1. 3 7
1178 2. 0 3 3. 0 5
1179 3. 0 0 5. 0 0
1180 4. 0 0 7. 0 0
1181
1182 The table is internally in the right hand, idx form. However as we also
1183 have to deal with the states array which is indexed by nodenum we have to
1184 use TRIE_NODENUM() to convert.
1185
1186 */
1187
a02a5408 1188 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1189 reg_trie_trans );
a02a5408 1190 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1191 next_alloc = trie->uniquecharcount + 1;
1192
1193 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1194
1195 regnode *noper = NEXTOPER( cur );
a28509cc
AL
1196 const U8 *uc = (U8*)STRING( noper );
1197 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1198
1199 U32 state = 1; /* required init */
1200
1201 U16 charid = 0; /* sanity init */
1202 U32 accept_state = 0; /* sanity init */
1203 U8 *scan = (U8*)NULL; /* sanity init */
1204
1205 STRLEN foldlen = 0; /* required init */
1206 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1207
1208
1209 for ( ; uc < e ; uc += len ) {
1210
1211 TRIE_READ_CHAR;
1212
1213 if ( uvc < 256 ) {
1214 charid = trie->charmap[ uvc ];
1215 } else {
1216 SV** svpp=(SV**)NULL;
1217 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1218 if ( !svpp ) {
1219 charid = 0;
1220 } else {
1221 charid=(U16)SvIV( *svpp );
1222 }
1223 }
1224 if ( charid ) {
1225 charid--;
1226 if ( !trie->trans[ state + charid ].next ) {
1227 trie->trans[ state + charid ].next = next_alloc;
1228 trie->trans[ state ].check++;
1229 next_alloc += trie->uniquecharcount;
1230 }
1231 state = trie->trans[ state + charid ].next;
1232 } else {
e4584336 1233 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1234 }
1235 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1236 }
1237
1238 accept_state = TRIE_NODENUM( state );
1239 if ( !trie->states[ accept_state ].wordnum ) {
1240 /* we havent inserted this word into the structure yet. */
1241 trie->states[ accept_state ].wordnum = ++curword;
1242
1243 DEBUG_r({
1244 /* store the word for dumping */
1245 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1246 if ( UTF ) SvUTF8_on( tmp );
1247 av_push( trie->words, tmp );
1248 });
1249
1250 } else {
1251 /* Its a dupe. So ignore it. */
1252 }
1253
1254 } /* end second pass */
1255
1256 DEBUG_TRIE_COMPILE_MORE_r({
1257 /*
1258 print out the table precompression so that we can do a visual check
1259 that they are identical.
1260 */
1261 U32 state;
1262 U16 charid;
1263 PerlIO_printf( Perl_debug_log, "\nChar : " );
1264
1265 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1266 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1267 if ( tmp ) {
cfd0369c 1268 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1269 }
1270 }
1271
1272 PerlIO_printf( Perl_debug_log, "\nState+-" );
1273
1274 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1275 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1276 }
1277
1278 PerlIO_printf( Perl_debug_log, "\n" );
1279
1280 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1281
e4584336 1282 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1283
1284 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1285 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1286 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1287 }
1288 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1289 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1290 } else {
e4584336 1291 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1292 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1293 }
1294 }
1295 PerlIO_printf( Perl_debug_log, "\n\n" );
1296 });
1297 {
1298 /*
1299 * Inplace compress the table.*
1300
1301 For sparse data sets the table constructed by the trie algorithm will
1302 be mostly 0/FAIL transitions or to put it another way mostly empty.
1303 (Note that leaf nodes will not contain any transitions.)
1304
1305 This algorithm compresses the tables by eliminating most such
1306 transitions, at the cost of a modest bit of extra work during lookup:
1307
1308 - Each states[] entry contains a .base field which indicates the
1309 index in the state[] array wheres its transition data is stored.
1310
1311 - If .base is 0 there are no valid transitions from that node.
1312
1313 - If .base is nonzero then charid is added to it to find an entry in
1314 the trans array.
1315
1316 -If trans[states[state].base+charid].check!=state then the
1317 transition is taken to be a 0/Fail transition. Thus if there are fail
1318 transitions at the front of the node then the .base offset will point
1319 somewhere inside the previous nodes data (or maybe even into a node
1320 even earlier), but the .check field determines if the transition is
1321 valid.
1322
1323 The following process inplace converts the table to the compressed
1324 table: We first do not compress the root node 1,and mark its all its
1325 .check pointers as 1 and set its .base pointer as 1 as well. This
1326 allows to do a DFA construction from the compressed table later, and
1327 ensures that any .base pointers we calculate later are greater than
1328 0.
1329
1330 - We set 'pos' to indicate the first entry of the second node.
1331
1332 - We then iterate over the columns of the node, finding the first and
1333 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1334 and set the .check pointers accordingly, and advance pos
1335 appropriately and repreat for the next node. Note that when we copy
1336 the next pointers we have to convert them from the original
1337 NODEIDX form to NODENUM form as the former is not valid post
1338 compression.
1339
1340 - If a node has no transitions used we mark its base as 0 and do not
1341 advance the pos pointer.
1342
1343 - If a node only has one transition we use a second pointer into the
1344 structure to fill in allocated fail transitions from other states.
1345 This pointer is independent of the main pointer and scans forward
1346 looking for null transitions that are allocated to a state. When it
1347 finds one it writes the single transition into the "hole". If the
1348 pointer doesnt find one the single transition is appeneded as normal.
1349
1350 - Once compressed we can Renew/realloc the structures to release the
1351 excess space.
1352
1353 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1354 specifically Fig 3.47 and the associated pseudocode.
1355
1356 demq
1357 */
a3b680e6 1358 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1359 U32 state, charid;
a3621e74
YO
1360 U32 pos = 0, zp=0;
1361 trie->laststate = laststate;
1362
1363 for ( state = 1 ; state < laststate ; state++ ) {
1364 U8 flag = 0;
a28509cc
AL
1365 const U32 stateidx = TRIE_NODEIDX( state );
1366 const U32 o_used = trie->trans[ stateidx ].check;
1367 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1368 trie->trans[ stateidx ].check = 0;
1369
1370 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1371 if ( flag || trie->trans[ stateidx + charid ].next ) {
1372 if ( trie->trans[ stateidx + charid ].next ) {
1373 if (o_used == 1) {
1374 for ( ; zp < pos ; zp++ ) {
1375 if ( ! trie->trans[ zp ].next ) {
1376 break;
1377 }
1378 }
1379 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1380 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1381 trie->trans[ zp ].check = state;
1382 if ( ++zp > pos ) pos = zp;
1383 break;
1384 }
1385 used--;
1386 }
1387 if ( !flag ) {
1388 flag = 1;
1389 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1390 }
1391 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1392 trie->trans[ pos ].check = state;
1393 pos++;
1394 }
1395 }
1396 }
cc601c31 1397 trie->lasttrans = pos + 1;
a3621e74
YO
1398 Renew( trie->states, laststate + 1, reg_trie_state);
1399 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1400 PerlIO_printf( Perl_debug_log,
1401 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1402 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1403 (IV)next_alloc,
1404 (IV)pos,
a3621e74
YO
1405 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1406 );
1407
1408 } /* end table compress */
1409 }
cc601c31
YO
1410 /* resize the trans array to remove unused space */
1411 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1412
1413 DEBUG_TRIE_COMPILE_r({
1414 U32 state;
1415 /*
1416 Now we print it out again, in a slightly different form as there is additional
1417 info we want to be able to see when its compressed. They are close enough for
1418 visual comparison though.
1419 */
1420 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1421
1422 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1423 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1424 if ( tmp ) {
cfd0369c 1425 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1426 }
1427 }
1428 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1429
a3621e74
YO
1430 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1431 PerlIO_printf( Perl_debug_log, "-----");
1432 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1433
a3621e74 1434 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1435 const U32 base = trie->states[ state ].trans.base;
a3621e74 1436
e4584336 1437 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1438
1439 if ( trie->states[ state ].wordnum ) {
1440 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1441 } else {
1442 PerlIO_printf( Perl_debug_log, "%6s", "" );
1443 }
1444
e4584336 1445 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1446
1447 if ( base ) {
1448 U32 ofs = 0;
1449
cc601c31
YO
1450 while( ( base + ofs < trie->uniquecharcount ) ||
1451 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1452 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1453 ofs++;
1454
e4584336 1455 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1456
1457 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1458 if ( ( base + ofs >= trie->uniquecharcount ) &&
1459 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1460 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1461 {
e4584336
RB
1462 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1463 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1464 } else {
1465 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1466 }
1467 }
1468
e4584336 1469 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1470
1471 }
1472 PerlIO_printf( Perl_debug_log, "\n" );
1473 }
1474 });
1475
1476 {
1477 /* now finally we "stitch in" the new TRIE node
1478 This means we convert either the first branch or the first Exact,
1479 depending on whether the thing following (in 'last') is a branch
1480 or not and whther first is the startbranch (ie is it a sub part of
1481 the alternation or is it the whole thing.)
1482 Assuming its a sub part we conver the EXACT otherwise we convert
1483 the whole branch sequence, including the first.
1484 */
1485 regnode *convert;
1486
1487
1488
1489
1490 if ( first == startbranch && OP( last ) != BRANCH ) {
1491 convert = first;
1492 } else {
1493 convert = NEXTOPER( first );
1494 NEXT_OFF( first ) = (U16)(last - first);
1495 }
1496
1497 OP( convert ) = TRIE + (U8)( flags - EXACT );
1498 NEXT_OFF( convert ) = (U16)(tail - convert);
1499 ARG_SET( convert, data_slot );
1500
1501 /* tells us if we need to handle accept buffers specially */
1502 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1503
1504
1505 /* needed for dumping*/
1506 DEBUG_r({
1507 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1508 /* We now need to mark all of the space originally used by the
1509 branches as optimized away. This keeps the dumpuntil from
1510 throwing a wobbly as it doesnt use regnext() to traverse the
1511 opcodes.
1512 */
1513 while( optimize < last ) {
1514 OP( optimize ) = OPTIMIZED;
1515 optimize++;
1516 }
1517 });
1518 } /* end node insert */
1519 return 1;
1520}
1521
1522
1523
1524/*
5d1c421c
JH
1525 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1526 * These need to be revisited when a newer toolchain becomes available.
1527 */
1528#if defined(__sparc64__) && defined(__GNUC__)
1529# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1530# undef SPARC64_GCC_WORKAROUND
1531# define SPARC64_GCC_WORKAROUND 1
1532# endif
1533#endif
1534
653099ff
GS
1535/* REx optimizer. Converts nodes into quickier variants "in place".
1536 Finds fixed substrings. */
1537
a0288114 1538/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1539 to the position after last scanned or to NULL. */
1540
a3621e74 1541
76e3520e 1542STATIC I32
9a957fbc
AL
1543S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1544 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1545 /* scanp: Start here (read-write). */
1546 /* deltap: Write maxlen-minlen here. */
1547 /* last: Stop before this one. */
1548{
1549 I32 min = 0, pars = 0, code;
1550 regnode *scan = *scanp, *next;
1551 I32 delta = 0;
1552 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1553 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1554 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1555 scan_data_t data_fake;
653099ff 1556 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1557 SV *re_trie_maxbuff = NULL;
1558
1559 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1560
c277df42
IZ
1561 while (scan && OP(scan) != END && scan < last) {
1562 /* Peephole optimizer: */
a3621e74
YO
1563 DEBUG_OPTIMISE_r({
1564 SV *mysv=sv_newmortal();
1565 regprop( mysv, scan);
e4584336 1566 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1567 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1568 });
c277df42 1569
22c35a8c 1570 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1571 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1572 regnode *n = regnext(scan);
1573 U32 stringok = 1;
1574#ifdef DEBUGGING
1575 regnode *stop = scan;
b81d288d 1576#endif
c277df42 1577
cd439c50 1578 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1579 /* Skip NOTHING, merge EXACT*. */
1580 while (n &&
b81d288d 1581 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1582 (stringok && (OP(n) == OP(scan))))
1583 && NEXT_OFF(n)
1584 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1585 if (OP(n) == TAIL || n > next)
1586 stringok = 0;
22c35a8c 1587 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1588 NEXT_OFF(scan) += NEXT_OFF(n);
1589 next = n + NODE_STEP_REGNODE;
1590#ifdef DEBUGGING
1591 if (stringok)
1592 stop = n;
b81d288d 1593#endif
c277df42 1594 n = regnext(n);
a0ed51b3 1595 }
f49d4d0f 1596 else if (stringok) {
a3b680e6 1597 const int oldl = STR_LEN(scan);
c277df42 1598 regnode *nnext = regnext(n);
f49d4d0f 1599
b81d288d 1600 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1601 break;
1602 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1603 STR_LEN(scan) += STR_LEN(n);
1604 next = n + NODE_SZ_STR(n);
c277df42 1605 /* Now we can overwrite *n : */
f49d4d0f 1606 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1607#ifdef DEBUGGING
f49d4d0f 1608 stop = next - 1;
b81d288d 1609#endif
c277df42
IZ
1610 n = nnext;
1611 }
1612 }
61a36c01 1613
a3621e74 1614 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1615/*
1616 Two problematic code points in Unicode casefolding of EXACT nodes:
1617
1618 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1619 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1620
1621 which casefold to
1622
1623 Unicode UTF-8
1624
1625 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1626 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1627
1628 This means that in case-insensitive matching (or "loose matching",
1629 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1630 length of the above casefolded versions) can match a target string
1631 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1632 This would rather mess up the minimum length computation.
1633
1634 What we'll do is to look for the tail four bytes, and then peek
1635 at the preceding two bytes to see whether we need to decrease
1636 the minimum length by four (six minus two).
1637
1638 Thanks to the design of UTF-8, there cannot be false matches:
1639 A sequence of valid UTF-8 bytes cannot be a subsequence of
1640 another valid sequence of UTF-8 bytes.
1641
1642*/
1643 char *s0 = STRING(scan), *s, *t;
1644 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
a28509cc
AL
1645 const char * const t0 = "\xcc\x88\xcc\x81";
1646 const char * const t1 = t0 + 3;
2af232bd 1647
61a36c01
JH
1648 for (s = s0 + 2;
1649 s < s2 && (t = ninstr(s, s1, t0, t1));
1650 s = t + 4) {
1651 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1652 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1653 min -= 4;
1654 }
1655 }
1656
c277df42
IZ
1657#ifdef DEBUGGING
1658 /* Allow dumping */
cd439c50 1659 n = scan + NODE_SZ_STR(scan);
c277df42 1660 while (n <= stop) {
22c35a8c 1661 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1662 OP(n) = OPTIMIZED;
1663 NEXT_OFF(n) = 0;
1664 }
1665 n++;
1666 }
653099ff 1667#endif
c277df42 1668 }
a3621e74
YO
1669
1670
1671
653099ff
GS
1672 /* Follow the next-chain of the current node and optimize
1673 away all the NOTHINGs from it. */
c277df42 1674 if (OP(scan) != CURLYX) {
a3b680e6 1675 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1676 ? I32_MAX
1677 /* I32 may be smaller than U16 on CRAYs! */
1678 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1679 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1680 int noff;
1681 regnode *n = scan;
b81d288d 1682
c277df42
IZ
1683 /* Skip NOTHING and LONGJMP. */
1684 while ((n = regnext(n))
22c35a8c 1685 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1686 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1687 && off + noff < max)
1688 off += noff;
1689 if (reg_off_by_arg[OP(scan)])
1690 ARG(scan) = off;
b81d288d 1691 else
c277df42
IZ
1692 NEXT_OFF(scan) = off;
1693 }
a3621e74 1694
653099ff
GS
1695 /* The principal pseudo-switch. Cannot be a switch, since we
1696 look into several different things. */
b81d288d 1697 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1698 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1699 next = regnext(scan);
1700 code = OP(scan);
a3621e74 1701 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1702
1703 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1704 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1705 struct regnode_charclass_class accum;
a3621e74 1706 regnode *startbranch=scan;
c277df42 1707
653099ff 1708 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1709 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1710 if (flags & SCF_DO_STCLASS)
830247a4 1711 cl_init_zero(pRExC_state, &accum);
a3621e74 1712
c277df42 1713 while (OP(scan) == code) {
830247a4 1714 I32 deltanext, minnext, f = 0, fake;
653099ff 1715 struct regnode_charclass_class this_class;
c277df42
IZ
1716
1717 num++;
1718 data_fake.flags = 0;
b81d288d 1719 if (data) {
2c2d71f5 1720 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1721 data_fake.last_closep = data->last_closep;
1722 }
1723 else
1724 data_fake.last_closep = &fake;
c277df42
IZ
1725 next = regnext(scan);
1726 scan = NEXTOPER(scan);
1727 if (code != BRANCH)
1728 scan = NEXTOPER(scan);
653099ff 1729 if (flags & SCF_DO_STCLASS) {
830247a4 1730 cl_init(pRExC_state, &this_class);
653099ff
GS
1731 data_fake.start_class = &this_class;
1732 f = SCF_DO_STCLASS_AND;
b81d288d 1733 }
e1901655
IZ
1734 if (flags & SCF_WHILEM_VISITED_POS)
1735 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1736
653099ff 1737 /* we suppose the run is continuous, last=next...*/
830247a4 1738 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1739 next, &data_fake, f,depth+1);
b81d288d 1740 if (min1 > minnext)
c277df42
IZ
1741 min1 = minnext;
1742 if (max1 < minnext + deltanext)
1743 max1 = minnext + deltanext;
1744 if (deltanext == I32_MAX)
aca2d497 1745 is_inf = is_inf_internal = 1;
c277df42
IZ
1746 scan = next;
1747 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1748 pars++;
405ff068 1749 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1750 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1751 if (data)
1752 data->whilem_c = data_fake.whilem_c;
653099ff 1753 if (flags & SCF_DO_STCLASS)
830247a4 1754 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1755 if (code == SUSPEND)
c277df42
IZ
1756 break;
1757 }
1758 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1759 min1 = 0;
1760 if (flags & SCF_DO_SUBSTR) {
1761 data->pos_min += min1;
1762 data->pos_delta += max1 - min1;
1763 if (max1 != min1 || is_inf)
1764 data->longest = &(data->longest_float);
1765 }
1766 min += min1;
1767 delta += max1 - min1;
653099ff 1768 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1769 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1770 if (min1) {
1771 cl_and(data->start_class, &and_with);
1772 flags &= ~SCF_DO_STCLASS;
1773 }
1774 }
1775 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1776 if (min1) {
1777 cl_and(data->start_class, &accum);
653099ff 1778 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1779 }
1780 else {
b81d288d 1781 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1782 * data->start_class */
1783 StructCopy(data->start_class, &and_with,
1784 struct regnode_charclass_class);
1785 flags &= ~SCF_DO_STCLASS_AND;
1786 StructCopy(&accum, data->start_class,
1787 struct regnode_charclass_class);
1788 flags |= SCF_DO_STCLASS_OR;
1789 data->start_class->flags |= ANYOF_EOS;
1790 }
653099ff 1791 }
a3621e74
YO
1792
1793 /* demq.
1794
1795 Assuming this was/is a branch we are dealing with: 'scan' now
1796 points at the item that follows the branch sequence, whatever
1797 it is. We now start at the beginning of the sequence and look
1798 for subsequences of
1799
1800 BRANCH->EXACT=>X
1801 BRANCH->EXACT=>X
1802
1803 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1804
1805 If we can find such a subseqence we need to turn the first
1806 element into a trie and then add the subsequent branch exact
1807 strings to the trie.
1808
1809 We have two cases
1810
1811 1. patterns where the whole set of branch can be converted to a trie,
1812
1813 2. patterns where only a subset of the alternations can be
1814 converted to a trie.
1815
1816 In case 1 we can replace the whole set with a single regop
1817 for the trie. In case 2 we need to keep the start and end
1818 branchs so
1819
1820 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1821 becomes BRANCH TRIE; BRANCH X;
1822
1823 Hypthetically when we know the regex isnt anchored we can
1824 turn a case 1 into a DFA and let it rip... Every time it finds a match
1825 it would just call its tail, no WHILEM/CURLY needed.
1826
1827 */
0111c4fd
RGS
1828 if (DO_TRIE) {
1829 if (!re_trie_maxbuff) {
1830 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1831 if (!SvIOK(re_trie_maxbuff))
1832 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1833 }
a3621e74
YO
1834 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1835 regnode *cur;
1836 regnode *first = (regnode *)NULL;
1837 regnode *last = (regnode *)NULL;
1838 regnode *tail = scan;
1839 U8 optype = 0;
1840 U32 count=0;
1841
1842#ifdef DEBUGGING
1843 SV *mysv = sv_newmortal(); /* for dumping */
1844#endif
1845 /* var tail is used because there may be a TAIL
1846 regop in the way. Ie, the exacts will point to the
1847 thing following the TAIL, but the last branch will
1848 point at the TAIL. So we advance tail. If we
1849 have nested (?:) we may have to move through several
1850 tails.
1851 */
1852
1853 while ( OP( tail ) == TAIL ) {
1854 /* this is the TAIL generated by (?:) */
1855 tail = regnext( tail );
1856 }
1857
1858 DEBUG_OPTIMISE_r({
1859 regprop( mysv, tail );
1860 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1861 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1862 (RExC_seen_evals) ? "[EVAL]" : ""
1863 );
1864 });
1865 /*
1866
1867 step through the branches, cur represents each
1868 branch, noper is the first thing to be matched
1869 as part of that branch and noper_next is the
1870 regnext() of that node. if noper is an EXACT
1871 and noper_next is the same as scan (our current
1872 position in the regex) then the EXACT branch is
1873 a possible optimization target. Once we have
1874 two or more consequetive such branches we can
1875 create a trie of the EXACT's contents and stich
1876 it in place. If the sequence represents all of
1877 the branches we eliminate the whole thing and
1878 replace it with a single TRIE. If it is a
1879 subsequence then we need to stitch it in. This
1880 means the first branch has to remain, and needs
1881 to be repointed at the item on the branch chain
1882 following the last branch optimized. This could
1883 be either a BRANCH, in which case the
1884 subsequence is internal, or it could be the
1885 item following the branch sequence in which
1886 case the subsequence is at the end.
1887
1888 */
1889
1890 /* dont use tail as the end marker for this traverse */
1891 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1892 regnode * const noper = NEXTOPER( cur );
1893 regnode * const noper_next = regnext( noper );
a3621e74 1894
a3621e74
YO
1895 DEBUG_OPTIMISE_r({
1896 regprop( mysv, cur);
1897 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1898 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74
YO
1899
1900 regprop( mysv, noper);
1901 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1902 SvPV_nolen_const(mysv));
a3621e74
YO
1903
1904 if ( noper_next ) {
1905 regprop( mysv, noper_next );
1906 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1907 SvPV_nolen_const(mysv));
a3621e74
YO
1908 }
1909 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1910 first, last, cur );
1911 });
1912 if ( ( first ? OP( noper ) == optype
1913 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1914 && noper_next == tail && count<U16_MAX)
1915 {
1916 count++;
1917 if ( !first ) {
1918 first = cur;
1919 optype = OP( noper );
1920 } else {
1921 DEBUG_OPTIMISE_r(
1922 if (!last ) {
1923 regprop( mysv, first);
1924 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1925 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1926 regprop( mysv, NEXTOPER(first) );
1927 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1928 SvPV_nolen_const( mysv ) );
a3621e74
YO
1929 }
1930 );
1931 last = cur;
1932 DEBUG_OPTIMISE_r({
1933 regprop( mysv, cur);
1934 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1935 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1936 regprop( mysv, noper );
1937 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1938 SvPV_nolen_const( mysv ) );
a3621e74
YO
1939 });
1940 }
1941 } else {
1942 if ( last ) {
1943 DEBUG_OPTIMISE_r(
1944 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1945 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1946 );
1947 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1948 }
1949 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1950 && noper_next == tail )
1951 {
1952 count = 1;
1953 first = cur;
1954 optype = OP( noper );
1955 } else {
1956 count = 0;
1957 first = NULL;
1958 optype = 0;
1959 }
1960 last = NULL;
1961 }
1962 }
1963 DEBUG_OPTIMISE_r({
1964 regprop( mysv, cur);
1965 PerlIO_printf( Perl_debug_log,
e4584336 1966 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
cfd0369c 1967 " ", SvPV_nolen_const( mysv ), first, last, cur);
a3621e74
YO
1968
1969 });
1970 if ( last ) {
1971 DEBUG_OPTIMISE_r(
1972 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1973 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1974 );
1975 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1976 }
1977 }
1978 }
a0ed51b3 1979 }
a3621e74 1980 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1981 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1982 } else /* single branch is optimized. */
c277df42
IZ
1983 scan = NEXTOPER(scan);
1984 continue;
a0ed51b3
LW
1985 }
1986 else if (OP(scan) == EXACT) {
cd439c50 1987 I32 l = STR_LEN(scan);
1aa99e6b 1988 UV uc = *((U8*)STRING(scan));
a0ed51b3 1989 if (UTF) {
a3b680e6 1990 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1991 l = utf8_length(s, s + l);
9041c2e3 1992 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
1993 }
1994 min += l;
c277df42 1995 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1996 /* The code below prefers earlier match for fixed
1997 offset, later match for variable offset. */
1998 if (data->last_end == -1) { /* Update the start info. */
1999 data->last_start_min = data->pos_min;
2000 data->last_start_max = is_inf
b81d288d 2001 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2002 }
cd439c50 2003 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292 2004 {
9a957fbc 2005 SV * const sv = data->last_found;
a28509cc 2006 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2007 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2008 if (mg && mg->mg_len >= 0)
5e43f467
JH
2009 mg->mg_len += utf8_length((U8*)STRING(scan),
2010 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2011 }
33b8afdf
JH
2012 if (UTF)
2013 SvUTF8_on(data->last_found);
c277df42
IZ
2014 data->last_end = data->pos_min + l;
2015 data->pos_min += l; /* As in the first entry. */
2016 data->flags &= ~SF_BEFORE_EOL;
2017 }
653099ff
GS
2018 if (flags & SCF_DO_STCLASS_AND) {
2019 /* Check whether it is compatible with what we know already! */
2020 int compat = 1;
2021
1aa99e6b 2022 if (uc >= 0x100 ||
516a5887 2023 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2024 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2025 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2026 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2027 )
653099ff
GS
2028 compat = 0;
2029 ANYOF_CLASS_ZERO(data->start_class);
2030 ANYOF_BITMAP_ZERO(data->start_class);
2031 if (compat)
1aa99e6b 2032 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2033 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2034 if (uc < 0x100)
2035 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2036 }
2037 else if (flags & SCF_DO_STCLASS_OR) {
2038 /* false positive possible if the class is case-folded */
1aa99e6b 2039 if (uc < 0x100)
9b877dbb
IH
2040 ANYOF_BITMAP_SET(data->start_class, uc);
2041 else
2042 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2043 data->start_class->flags &= ~ANYOF_EOS;
2044 cl_and(data->start_class, &and_with);
2045 }
2046 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2047 }
653099ff 2048 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2049 I32 l = STR_LEN(scan);
1aa99e6b 2050 UV uc = *((U8*)STRING(scan));
653099ff
GS
2051
2052 /* Search for fixed substrings supports EXACT only. */
b81d288d 2053 if (flags & SCF_DO_SUBSTR)
830247a4 2054 scan_commit(pRExC_state, data);
a0ed51b3 2055 if (UTF) {
1aa99e6b
IH
2056 U8 *s = (U8 *)STRING(scan);
2057 l = utf8_length(s, s + l);
9041c2e3 2058 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2059 }
2060 min += l;
c277df42 2061 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2062 data->pos_min += l;
653099ff
GS
2063 if (flags & SCF_DO_STCLASS_AND) {
2064 /* Check whether it is compatible with what we know already! */
2065 int compat = 1;
2066
1aa99e6b 2067 if (uc >= 0x100 ||
516a5887 2068 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2069 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2070 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2071 compat = 0;
2072 ANYOF_CLASS_ZERO(data->start_class);
2073 ANYOF_BITMAP_ZERO(data->start_class);
2074 if (compat) {
1aa99e6b 2075 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2076 data->start_class->flags &= ~ANYOF_EOS;
2077 data->start_class->flags |= ANYOF_FOLD;
2078 if (OP(scan) == EXACTFL)
2079 data->start_class->flags |= ANYOF_LOCALE;
2080 }
2081 }
2082 else if (flags & SCF_DO_STCLASS_OR) {
2083 if (data->start_class->flags & ANYOF_FOLD) {
2084 /* false positive possible if the class is case-folded.
2085 Assume that the locale settings are the same... */
1aa99e6b
IH
2086 if (uc < 0x100)
2087 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2088 data->start_class->flags &= ~ANYOF_EOS;
2089 }
2090 cl_and(data->start_class, &and_with);
2091 }
2092 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2093 }
bfed75c6 2094 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2095 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2096 I32 f = flags, pos_before = 0;
c277df42 2097 regnode *oscan = scan;
653099ff
GS
2098 struct regnode_charclass_class this_class;
2099 struct regnode_charclass_class *oclass = NULL;
727f22e3 2100 I32 next_is_eval = 0;
653099ff 2101
22c35a8c 2102 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2103 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2104 scan = NEXTOPER(scan);
2105 goto finish;
2106 case PLUS:
653099ff 2107 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2108 next = NEXTOPER(scan);
653099ff 2109 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2110 mincount = 1;
2111 maxcount = REG_INFTY;
c277df42
IZ
2112 next = regnext(scan);
2113 scan = NEXTOPER(scan);
2114 goto do_curly;
2115 }
2116 }
2117 if (flags & SCF_DO_SUBSTR)
2118 data->pos_min++;
2119 min++;
2120 /* Fall through. */
2121 case STAR:
653099ff
GS
2122 if (flags & SCF_DO_STCLASS) {
2123 mincount = 0;
b81d288d 2124 maxcount = REG_INFTY;
653099ff
GS
2125 next = regnext(scan);
2126 scan = NEXTOPER(scan);
2127 goto do_curly;
2128 }
b81d288d 2129 is_inf = is_inf_internal = 1;
c277df42
IZ
2130 scan = regnext(scan);
2131 if (flags & SCF_DO_SUBSTR) {
830247a4 2132 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2133 data->longest = &(data->longest_float);
2134 }
2135 goto optimize_curly_tail;
2136 case CURLY:
b81d288d 2137 mincount = ARG1(scan);
c277df42
IZ
2138 maxcount = ARG2(scan);
2139 next = regnext(scan);
cb434fcc
IZ
2140 if (OP(scan) == CURLYX) {
2141 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2142 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2143 }
c277df42 2144 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2145 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2146 do_curly:
2147 if (flags & SCF_DO_SUBSTR) {
830247a4 2148 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2149 pos_before = data->pos_min;
2150 }
2151 if (data) {
2152 fl = data->flags;
2153 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2154 if (is_inf)
2155 data->flags |= SF_IS_INF;
2156 }
653099ff 2157 if (flags & SCF_DO_STCLASS) {
830247a4 2158 cl_init(pRExC_state, &this_class);
653099ff
GS
2159 oclass = data->start_class;
2160 data->start_class = &this_class;
2161 f |= SCF_DO_STCLASS_AND;
2162 f &= ~SCF_DO_STCLASS_OR;
2163 }
e1901655
IZ
2164 /* These are the cases when once a subexpression
2165 fails at a particular position, it cannot succeed
2166 even after backtracking at the enclosing scope.
b81d288d 2167
e1901655
IZ
2168 XXXX what if minimal match and we are at the
2169 initial run of {n,m}? */
2170 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2171 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2172
c277df42 2173 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2174 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2175 (mincount == 0
2176 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2177
2178 if (flags & SCF_DO_STCLASS)
2179 data->start_class = oclass;
2180 if (mincount == 0 || minnext == 0) {
2181 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2182 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2183 }
2184 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2185 /* Switch to OR mode: cache the old value of
653099ff
GS
2186 * data->start_class */
2187 StructCopy(data->start_class, &and_with,
2188 struct regnode_charclass_class);
2189 flags &= ~SCF_DO_STCLASS_AND;
2190 StructCopy(&this_class, data->start_class,
2191 struct regnode_charclass_class);
2192 flags |= SCF_DO_STCLASS_OR;
2193 data->start_class->flags |= ANYOF_EOS;
2194 }
2195 } else { /* Non-zero len */
2196 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2197 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2198 cl_and(data->start_class, &and_with);
2199 }
2200 else if (flags & SCF_DO_STCLASS_AND)
2201 cl_and(data->start_class, &this_class);
2202 flags &= ~SCF_DO_STCLASS;
2203 }
c277df42
IZ
2204 if (!scan) /* It was not CURLYX, but CURLY. */
2205 scan = next;
041457d9
DM
2206 if ( /* ? quantifier ok, except for (?{ ... }) */
2207 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2208 && (minnext == 0) && (deltanext == 0)
99799961 2209 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2210 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2211 && ckWARN(WARN_REGEXP))
b45f050a 2212 {
830247a4 2213 vWARN(RExC_parse,
b45f050a
JF
2214 "Quantifier unexpected on zero-length expression");
2215 }
2216
c277df42 2217 min += minnext * mincount;
b81d288d 2218 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2219 && (minnext + deltanext) > 0)
2220 || deltanext == I32_MAX);
aca2d497 2221 is_inf |= is_inf_internal;
c277df42
IZ
2222 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2223
2224 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2225 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2226 && data->flags & SF_IN_PAR
2227 && !(data->flags & SF_HAS_EVAL)
2228 && !deltanext && minnext == 1 ) {
2229 /* Try to optimize to CURLYN. */
2230 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2231 regnode *nxt1 = nxt;
2232#ifdef DEBUGGING
2233 regnode *nxt2;
2234#endif
c277df42
IZ
2235
2236 /* Skip open. */
2237 nxt = regnext(nxt);
bfed75c6 2238 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2239 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2240 && STR_LEN(nxt) == 1))
c277df42 2241 goto nogo;
497b47a8 2242#ifdef DEBUGGING
c277df42 2243 nxt2 = nxt;
497b47a8 2244#endif
c277df42 2245 nxt = regnext(nxt);
b81d288d 2246 if (OP(nxt) != CLOSE)
c277df42
IZ
2247 goto nogo;
2248 /* Now we know that nxt2 is the only contents: */
eb160463 2249 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2250 OP(oscan) = CURLYN;
2251 OP(nxt1) = NOTHING; /* was OPEN. */
2252#ifdef DEBUGGING
2253 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2254 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2255 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2256 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2257 OP(nxt + 1) = OPTIMIZED; /* was count. */
2258 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2259#endif
c277df42 2260 }
c277df42
IZ
2261 nogo:
2262
2263 /* Try optimization CURLYX => CURLYM. */
b81d288d 2264 if ( OP(oscan) == CURLYX && data
c277df42 2265 && !(data->flags & SF_HAS_PAR)
c277df42 2266 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2267 && !deltanext /* atom is fixed width */
2268 && minnext != 0 /* CURLYM can't handle zero width */
2269 ) {
c277df42
IZ
2270 /* XXXX How to optimize if data == 0? */
2271 /* Optimize to a simpler form. */
2272 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2273 regnode *nxt2;
2274
2275 OP(oscan) = CURLYM;
2276 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2277 && (OP(nxt2) != WHILEM))
c277df42
IZ
2278 nxt = nxt2;
2279 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2280 /* Need to optimize away parenths. */
2281 if (data->flags & SF_IN_PAR) {
2282 /* Set the parenth number. */
2283 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2284
b81d288d 2285 if (OP(nxt) != CLOSE)
b45f050a 2286 FAIL("Panic opt close");
eb160463 2287 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2288 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2289 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2290#ifdef DEBUGGING
2291 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2292 OP(nxt + 1) = OPTIMIZED; /* was count. */
2293 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2294 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2295#endif
c277df42
IZ
2296#if 0
2297 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2298 regnode *nnxt = regnext(nxt1);
b81d288d 2299
c277df42
IZ
2300 if (nnxt == nxt) {
2301 if (reg_off_by_arg[OP(nxt1)])
2302 ARG_SET(nxt1, nxt2 - nxt1);
2303 else if (nxt2 - nxt1 < U16_MAX)
2304 NEXT_OFF(nxt1) = nxt2 - nxt1;
2305 else
2306 OP(nxt) = NOTHING; /* Cannot beautify */
2307 }
2308 nxt1 = nnxt;
2309 }
2310#endif
2311 /* Optimize again: */
b81d288d 2312 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2313 NULL, 0,depth+1);
a0ed51b3
LW
2314 }
2315 else
c277df42 2316 oscan->flags = 0;
c277df42 2317 }
e1901655
IZ
2318 else if ((OP(oscan) == CURLYX)
2319 && (flags & SCF_WHILEM_VISITED_POS)
2320 /* See the comment on a similar expression above.
2321 However, this time it not a subexpression
2322 we care about, but the expression itself. */
2323 && (maxcount == REG_INFTY)
2324 && data && ++data->whilem_c < 16) {
2325 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2326 /* Find WHILEM (as in regexec.c) */
2327 regnode *nxt = oscan + NEXT_OFF(oscan);
2328
2329 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2330 nxt += ARG(nxt);
eb160463
GS
2331 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2332 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2333 }
b81d288d 2334 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2335 pars++;
2336 if (flags & SCF_DO_SUBSTR) {
2337 SV *last_str = Nullsv;
2338 int counted = mincount != 0;
2339
2340 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2341#if defined(SPARC64_GCC_WORKAROUND)
2342 I32 b = 0;
2343 STRLEN l = 0;
cfd0369c 2344 const char *s = NULL;
5d1c421c
JH
2345 I32 old = 0;
2346
2347 if (pos_before >= data->last_start_min)
2348 b = pos_before;
2349 else
2350 b = data->last_start_min;
2351
2352 l = 0;
cfd0369c 2353 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2354 old = b - data->last_start_min;
2355
2356#else
b81d288d 2357 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2358 ? pos_before : data->last_start_min;
2359 STRLEN l;
cfd0369c 2360 const char *s = SvPV_const(data->last_found, l);
a0ed51b3 2361 I32 old = b - data->last_start_min;
5d1c421c 2362#endif
a0ed51b3
LW
2363
2364 if (UTF)
2365 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2366
a0ed51b3 2367 l -= old;
c277df42 2368 /* Get the added string: */
79cb57f6 2369 last_str = newSVpvn(s + old, l);
0e933229
IH
2370 if (UTF)
2371 SvUTF8_on(last_str);
c277df42
IZ
2372 if (deltanext == 0 && pos_before == b) {
2373 /* What was added is a constant string */
2374 if (mincount > 1) {
2375 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2376 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2377 SvPVX_const(last_str), l, mincount - 1);
b162af07 2378 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2379 /* Add additional parts. */
b81d288d 2380 SvCUR_set(data->last_found,
c277df42
IZ
2381 SvCUR(data->last_found) - l);
2382 sv_catsv(data->last_found, last_str);
0eda9292
JH
2383 {
2384 SV * sv = data->last_found;
2385 MAGIC *mg =
2386 SvUTF8(sv) && SvMAGICAL(sv) ?
2387 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2388 if (mg && mg->mg_len >= 0)
2389 mg->mg_len += CHR_SVLEN(last_str);
2390 }
c277df42
IZ
2391 data->last_end += l * (mincount - 1);
2392 }
2a8d9689
HS
2393 } else {
2394 /* start offset must point into the last copy */
2395 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2396 data->last_start_max += is_inf ? I32_MAX
2397 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2398 }
2399 }
2400 /* It is counted once already... */
2401 data->pos_min += minnext * (mincount - counted);
2402 data->pos_delta += - counted * deltanext +
2403 (minnext + deltanext) * maxcount - minnext * mincount;
2404 if (mincount != maxcount) {
653099ff
GS
2405 /* Cannot extend fixed substrings found inside
2406 the group. */
830247a4 2407 scan_commit(pRExC_state,data);
c277df42
IZ
2408 if (mincount && last_str) {
2409 sv_setsv(data->last_found, last_str);
2410 data->last_end = data->pos_min;
b81d288d 2411 data->last_start_min =
a0ed51b3 2412 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2413 data->last_start_max = is_inf
2414 ? I32_MAX
c277df42 2415 : data->pos_min + data->pos_delta
a0ed51b3 2416 - CHR_SVLEN(last_str);
c277df42
IZ
2417 }
2418 data->longest = &(data->longest_float);
2419 }
aca2d497 2420 SvREFCNT_dec(last_str);
c277df42 2421 }
405ff068 2422 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2423 data->flags |= SF_HAS_EVAL;
2424 optimize_curly_tail:
c277df42 2425 if (OP(oscan) != CURLYX) {
22c35a8c 2426 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2427 && NEXT_OFF(next))
2428 NEXT_OFF(oscan) += NEXT_OFF(next);
2429 }
c277df42 2430 continue;
653099ff 2431 default: /* REF and CLUMP only? */
c277df42 2432 if (flags & SCF_DO_SUBSTR) {
830247a4 2433 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2434 data->longest = &(data->longest_float);
2435 }
aca2d497 2436 is_inf = is_inf_internal = 1;
653099ff 2437 if (flags & SCF_DO_STCLASS_OR)
830247a4 2438 cl_anything(pRExC_state, data->start_class);
653099ff 2439 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2440 break;
2441 }
a0ed51b3 2442 }
bfed75c6 2443 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2444 int value = 0;
653099ff 2445
c277df42 2446 if (flags & SCF_DO_SUBSTR) {
830247a4 2447 scan_commit(pRExC_state,data);
c277df42
IZ
2448 data->pos_min++;
2449 }
2450 min++;
653099ff
GS
2451 if (flags & SCF_DO_STCLASS) {
2452 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2453
2454 /* Some of the logic below assumes that switching
2455 locale on will only add false positives. */
2456 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2457 case SANY:
653099ff
GS
2458 default:
2459 do_default:
2460 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2461 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2462 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2463 break;
2464 case REG_ANY:
2465 if (OP(scan) == SANY)
2466 goto do_default;
2467 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2468 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2469 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2470 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2471 }
2472 if (flags & SCF_DO_STCLASS_AND || !value)
2473 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2474 break;
2475 case ANYOF:
2476 if (flags & SCF_DO_STCLASS_AND)
2477 cl_and(data->start_class,
2478 (struct regnode_charclass_class*)scan);
2479 else
830247a4 2480 cl_or(pRExC_state, data->start_class,
653099ff
GS
2481 (struct regnode_charclass_class*)scan);
2482 break;
2483 case ALNUM:
2484 if (flags & SCF_DO_STCLASS_AND) {
2485 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2486 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2487 for (value = 0; value < 256; value++)
2488 if (!isALNUM(value))
2489 ANYOF_BITMAP_CLEAR(data->start_class, value);
2490 }
2491 }
2492 else {
2493 if (data->start_class->flags & ANYOF_LOCALE)
2494 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2495 else {
2496 for (value = 0; value < 256; value++)
2497 if (isALNUM(value))
b81d288d 2498 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2499 }
2500 }
2501 break;
2502 case ALNUML:
2503 if (flags & SCF_DO_STCLASS_AND) {
2504 if (data->start_class->flags & ANYOF_LOCALE)
2505 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2506 }
2507 else {
2508 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2509 data->start_class->flags |= ANYOF_LOCALE;
2510 }
2511 break;
2512 case NALNUM:
2513 if (flags & SCF_DO_STCLASS_AND) {
2514 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2516 for (value = 0; value < 256; value++)
2517 if (isALNUM(value))
2518 ANYOF_BITMAP_CLEAR(data->start_class, value);
2519 }
2520 }
2521 else {
2522 if (data->start_class->flags & ANYOF_LOCALE)
2523 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2524 else {
2525 for (value = 0; value < 256; value++)
2526 if (!isALNUM(value))
b81d288d 2527 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2528 }
2529 }
2530 break;
2531 case NALNUML:
2532 if (flags & SCF_DO_STCLASS_AND) {
2533 if (data->start_class->flags & ANYOF_LOCALE)
2534 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2535 }
2536 else {
2537 data->start_class->flags |= ANYOF_LOCALE;
2538 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2539 }
2540 break;
2541 case SPACE:
2542 if (flags & SCF_DO_STCLASS_AND) {
2543 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2545 for (value = 0; value < 256; value++)
2546 if (!isSPACE(value))
2547 ANYOF_BITMAP_CLEAR(data->start_class, value);
2548 }
2549 }
2550 else {
2551 if (data->start_class->flags & ANYOF_LOCALE)
2552 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2553 else {
2554 for (value = 0; value < 256; value++)
2555 if (isSPACE(value))
b81d288d 2556 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2557 }
2558 }
2559 break;
2560 case SPACEL:
2561 if (flags & SCF_DO_STCLASS_AND) {
2562 if (data->start_class->flags & ANYOF_LOCALE)
2563 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2564 }
2565 else {
2566 data->start_class->flags |= ANYOF_LOCALE;
2567 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2568 }
2569 break;
2570 case NSPACE:
2571 if (flags & SCF_DO_STCLASS_AND) {
2572 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2574 for (value = 0; value < 256; value++)
2575 if (isSPACE(value))
2576 ANYOF_BITMAP_CLEAR(data->start_class, value);
2577 }
2578 }
2579 else {
2580 if (data->start_class->flags & ANYOF_LOCALE)
2581 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2582 else {
2583 for (value = 0; value < 256; value++)
2584 if (!isSPACE(value))
b81d288d 2585 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2586 }
2587 }
2588 break;
2589 case NSPACEL:
2590 if (flags & SCF_DO_STCLASS_AND) {
2591 if (data->start_class->flags & ANYOF_LOCALE) {
2592 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2593 for (value = 0; value < 256; value++)
2594 if (!isSPACE(value))
2595 ANYOF_BITMAP_CLEAR(data->start_class, value);
2596 }
2597 }
2598 else {
2599 data->start_class->flags |= ANYOF_LOCALE;
2600 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2601 }
2602 break;
2603 case DIGIT:
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2606 for (value = 0; value < 256; value++)
2607 if (!isDIGIT(value))
2608 ANYOF_BITMAP_CLEAR(data->start_class, value);
2609 }
2610 else {
2611 if (data->start_class->flags & ANYOF_LOCALE)
2612 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2613 else {
2614 for (value = 0; value < 256; value++)
2615 if (isDIGIT(value))
b81d288d 2616 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2617 }
2618 }
2619 break;
2620 case NDIGIT:
2621 if (flags & SCF_DO_STCLASS_AND) {
2622 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2623 for (value = 0; value < 256; value++)
2624 if (isDIGIT(value))
2625 ANYOF_BITMAP_CLEAR(data->start_class, value);
2626 }
2627 else {
2628 if (data->start_class->flags & ANYOF_LOCALE)
2629 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2630 else {
2631 for (value = 0; value < 256; value++)
2632 if (!isDIGIT(value))
b81d288d 2633 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2634 }
2635 }
2636 break;
2637 }
2638 if (flags & SCF_DO_STCLASS_OR)
2639 cl_and(data->start_class, &and_with);
2640 flags &= ~SCF_DO_STCLASS;
2641 }
a0ed51b3 2642 }
22c35a8c 2643 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2644 data->flags |= (OP(scan) == MEOL
2645 ? SF_BEFORE_MEOL
2646 : SF_BEFORE_SEOL);
a0ed51b3 2647 }
653099ff
GS
2648 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2649 /* Lookbehind, or need to calculate parens/evals/stclass: */
2650 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2651 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2652 /* Lookahead/lookbehind */
cb434fcc 2653 I32 deltanext, minnext, fake = 0;
c277df42 2654 regnode *nscan;
653099ff
GS
2655 struct regnode_charclass_class intrnl;
2656 int f = 0;
c277df42
IZ
2657
2658 data_fake.flags = 0;
b81d288d 2659 if (data) {
2c2d71f5 2660 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2661 data_fake.last_closep = data->last_closep;
2662 }
2663 else
2664 data_fake.last_closep = &fake;
653099ff
GS
2665 if ( flags & SCF_DO_STCLASS && !scan->flags
2666 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2667 cl_init(pRExC_state, &intrnl);
653099ff 2668 data_fake.start_class = &intrnl;
e1901655 2669 f |= SCF_DO_STCLASS_AND;
653099ff 2670 }
e1901655
IZ
2671 if (flags & SCF_WHILEM_VISITED_POS)
2672 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2673 next = regnext(scan);
2674 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2675 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2676 if (scan->flags) {
2677 if (deltanext) {
9baa0206 2678 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2679 }
2680 else if (minnext > U8_MAX) {
9baa0206 2681 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2682 }
eb160463 2683 scan->flags = (U8)minnext;
c277df42
IZ
2684 }
2685 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2686 pars++;
405ff068 2687 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2688 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2689 if (data)
2690 data->whilem_c = data_fake.whilem_c;
e1901655 2691 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2692 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2693
2694 cl_and(data->start_class, &intrnl);
2695 if (was)
2696 data->start_class->flags |= ANYOF_EOS;
2697 }
a0ed51b3
LW
2698 }
2699 else if (OP(scan) == OPEN) {
c277df42 2700 pars++;
a0ed51b3 2701 }
cb434fcc 2702 else if (OP(scan) == CLOSE) {
eb160463 2703 if ((I32)ARG(scan) == is_par) {
cb434fcc 2704 next = regnext(scan);
c277df42 2705
cb434fcc
IZ
2706 if ( next && (OP(next) != WHILEM) && next < last)
2707 is_par = 0; /* Disable optimization */
2708 }
2709 if (data)
2710 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2711 }
2712 else if (OP(scan) == EVAL) {
c277df42
IZ
2713 if (data)
2714 data->flags |= SF_HAS_EVAL;
2715 }
96776eda 2716 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2717 if (flags & SCF_DO_SUBSTR) {
830247a4 2718 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2719 data->longest = &(data->longest_float);
2720 }
2721 is_inf = is_inf_internal = 1;
653099ff 2722 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2723 cl_anything(pRExC_state, data->start_class);
96776eda 2724 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2725 }
c277df42
IZ
2726 /* Else: zero-length, ignore. */
2727 scan = regnext(scan);
2728 }
2729
2730 finish:
2731 *scanp = scan;
aca2d497 2732 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2733 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2734 data->pos_delta = I32_MAX - data->pos_min;
2735 if (is_par > U8_MAX)
2736 is_par = 0;
2737 if (is_par && pars==1 && data) {
2738 data->flags |= SF_IN_PAR;
2739 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2740 }
2741 else if (pars && data) {
c277df42
IZ
2742 data->flags |= SF_HAS_PAR;
2743 data->flags &= ~SF_IN_PAR;
2744 }
653099ff
GS
2745 if (flags & SCF_DO_STCLASS_OR)
2746 cl_and(data->start_class, &and_with);
c277df42
IZ
2747 return min;
2748}
2749
76e3520e 2750STATIC I32
bfed75c6 2751S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2752{
830247a4 2753 if (RExC_rx->data) {
b81d288d
AB
2754 Renewc(RExC_rx->data,
2755 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2756 char, struct reg_data);
830247a4
IZ
2757 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2758 RExC_rx->data->count += n;
a0ed51b3
LW
2759 }
2760 else {
a02a5408 2761 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2762 char, struct reg_data);
a02a5408 2763 Newx(RExC_rx->data->what, n, U8);
830247a4 2764 RExC_rx->data->count = n;
c277df42 2765 }
830247a4
IZ
2766 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2767 return RExC_rx->data->count - n;
c277df42
IZ
2768}
2769
d88dccdf 2770void
864dbfa3 2771Perl_reginitcolors(pTHX)
d88dccdf 2772{
1df70142 2773 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2774 if (s) {
1df70142
AL
2775 char *t = savepv(s);
2776 int i = 0;
2777 PL_colors[0] = t;
d88dccdf 2778 while (++i < 6) {
1df70142
AL
2779 t = strchr(t, '\t');
2780 if (t) {
2781 *t = '\0';
2782 PL_colors[i] = ++t;
d88dccdf
IZ
2783 }
2784 else
1df70142 2785 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2786 }
2787 } else {
1df70142 2788 int i = 0;
b81d288d 2789 while (i < 6)
06b5626a 2790 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2791 }
2792 PL_colorset = 1;
2793}
2794
8615cb43 2795
a687059c 2796/*
e50aee73 2797 - pregcomp - compile a regular expression into internal code
a687059c
LW
2798 *
2799 * We can't allocate space until we know how big the compiled form will be,
2800 * but we can't compile it (and thus know how big it is) until we've got a
2801 * place to put the code. So we cheat: we compile it twice, once with code
2802 * generation turned off and size counting turned on, and once "for real".
2803 * This also means that we don't allocate space until we are sure that the
2804 * thing really will compile successfully, and we never have to move the
2805 * code and thus invalidate pointers into it. (Note that it has to be in
2806 * one piece because free() must be able to free it all.) [NB: not true in perl]
2807 *
2808 * Beware that the optimization-preparation code in here knows about some
2809 * of the structure of the compiled regexp. [I'll say.]
2810 */
2811regexp *
864dbfa3 2812Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2813{
a0d0e21e 2814 register regexp *r;
c277df42 2815 regnode *scan;
c277df42 2816 regnode *first;
a0d0e21e 2817 I32 flags;
a0d0e21e
LW
2818 I32 minlen = 0;
2819 I32 sawplus = 0;
2820 I32 sawopen = 0;
2c2d71f5 2821 scan_data_t data;
830247a4
IZ
2822 RExC_state_t RExC_state;
2823 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2824
a3621e74
YO
2825 GET_RE_DEBUG_FLAGS_DECL;
2826
a0d0e21e 2827 if (exp == NULL)
c277df42 2828 FAIL("NULL regexp argument");
a0d0e21e 2829
a5961de5 2830 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2831
5cfc7842 2832 RExC_precomp = exp;
a3621e74
YO
2833 DEBUG_r(if (!PL_colorset) reginitcolors());
2834 DEBUG_COMPILE_r({
2835 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2836 PL_colors[4],PL_colors[5],PL_colors[0],
2837 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2838 });
e2509266 2839 RExC_flags = pm->op_pmflags;
830247a4 2840 RExC_sawback = 0;
bbce6d69 2841
830247a4
IZ
2842 RExC_seen = 0;
2843 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2844 RExC_seen_evals = 0;
2845 RExC_extralen = 0;
c277df42 2846
bbce6d69 2847 /* First pass: determine size, legality. */
830247a4 2848 RExC_parse = exp;
fac92740 2849 RExC_start = exp;
830247a4
IZ
2850 RExC_end = xend;
2851 RExC_naughty = 0;
2852 RExC_npar = 1;
2853 RExC_size = 0L;
2854 RExC_emit = &PL_regdummy;
2855 RExC_whilem_seen = 0;
85ddcde9
JH
2856#if 0 /* REGC() is (currently) a NOP at the first pass.
2857 * Clever compilers notice this and complain. --jhi */
830247a4 2858 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2859#endif
830247a4 2860 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 2861 RExC_precomp = Nullch;
a0d0e21e
LW
2862 return(NULL);
2863 }
a3621e74 2864 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2865
c277df42
IZ
2866 /* Small enough for pointer-storage convention?
2867 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2868 if (RExC_size >= 0x10000L && RExC_extralen)
2869 RExC_size += RExC_extralen;
c277df42 2870 else
830247a4
IZ
2871 RExC_extralen = 0;
2872 if (RExC_whilem_seen > 15)
2873 RExC_whilem_seen = 15;
a0d0e21e 2874
bbce6d69 2875 /* Allocate space and initialize. */
a02a5408 2876 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2877 char, regexp);
a0d0e21e 2878 if (r == NULL)
b45f050a
JF
2879 FAIL("Regexp out of space");
2880
0f79a09d
GS
2881#ifdef DEBUGGING
2882 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2883 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2884#endif
c277df42 2885 r->refcnt = 1;
bbce6d69 2886 r->prelen = xend - exp;
5cfc7842 2887 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2888 r->subbeg = NULL;
f8c7b90f 2889#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2890 r->saved_copy = Nullsv;
2891#endif
cf93c79d 2892 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2893 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2894
2895 r->substrs = 0; /* Useful during FAIL. */
2896 r->startp = 0; /* Useful during FAIL. */
2897 r->endp = 0; /* Useful during FAIL. */
2898
a02a5408 2899 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2900 if (r->offsets) {
2af232bd 2901 r->offsets[0] = RExC_size;
fac92740 2902 }
a3621e74 2903 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2904 "%s %"UVuf" bytes for offset annotations.\n",
2905 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2906 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2907
830247a4 2908 RExC_rx = r;
bbce6d69 2909
2910 /* Second pass: emit code. */
e2509266 2911 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2912 RExC_parse = exp;
2913 RExC_end = xend;
2914 RExC_naughty = 0;
2915 RExC_npar = 1;
fac92740 2916 RExC_emit_start = r->program;
830247a4 2917 RExC_emit = r->program;
2cd61cdb 2918 /* Store the count of eval-groups for security checks: */
eb160463 2919 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2920 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2921 r->data = 0;
830247a4 2922 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2923 return(NULL);
2924
a3621e74 2925
a0d0e21e 2926 /* Dig out information for optimizations. */
cf93c79d 2927 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2928 pm->op_pmflags = RExC_flags;
a0ed51b3 2929 if (UTF)
5ff6fc6d 2930 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2931 r->regstclass = NULL;
830247a4 2932 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2933 r->reganch |= ROPT_NAUGHTY;
c277df42 2934 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2935
2936 /* XXXX To minimize changes to RE engine we always allocate
2937 3-units-long substrs field. */
a02a5408 2938 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2939
2c2d71f5 2940 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2941 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2942 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2943 I32 fake;
c5254dd6 2944 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2945 struct regnode_charclass_class ch_class;
2946 int stclass_flag;
cb434fcc 2947 I32 last_close = 0;
a0d0e21e
LW
2948
2949 first = scan;
c277df42 2950 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2951 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2952 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2953 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2954 (OP(first) == PLUS) ||
2955 (OP(first) == MINMOD) ||
653099ff 2956 /* An {n,m} with n>0 */
22c35a8c 2957 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2958 if (OP(first) == PLUS)
2959 sawplus = 1;
2960 else
2961 first += regarglen[(U8)OP(first)];
2962 first = NEXTOPER(first);
a687059c
LW
2963 }
2964
a0d0e21e
LW
2965 /* Starting-point info. */
2966 again:
653099ff 2967 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2968 if (OP(first) == EXACT)
2969 ; /* Empty, get anchored substr later. */
2970 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2971 r->regstclass = first;
2972 }
bfed75c6 2973 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2974 r->regstclass = first;
22c35a8c
GS
2975 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2976 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2977 r->regstclass = first;
22c35a8c 2978 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2979 r->reganch |= (OP(first) == MBOL
2980 ? ROPT_ANCH_MBOL
2981 : (OP(first) == SBOL
2982 ? ROPT_ANCH_SBOL
2983 : ROPT_ANCH_BOL));
a0d0e21e 2984 first = NEXTOPER(first);
774d564b 2985 goto again;
2986 }
2987 else if (OP(first) == GPOS) {
2988 r->reganch |= ROPT_ANCH_GPOS;
2989 first = NEXTOPER(first);
2990 goto again;
a0d0e21e 2991 }
e09294f4 2992 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2993 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2994 !(r->reganch & ROPT_ANCH) )
2995 {
2996 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2997 const int type =
2998 (OP(NEXTOPER(first)) == REG_ANY)
2999 ? ROPT_ANCH_MBOL
3000 : ROPT_ANCH_SBOL;
cad2e5aa 3001 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3002 first = NEXTOPER(first);
774d564b 3003 goto again;
a0d0e21e 3004 }
b81d288d 3005 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3006 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3007 /* x+ must match at the 1st pos of run of x's */
3008 r->reganch |= ROPT_SKIP;
a0d0e21e 3009
c277df42 3010 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3011 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3012 (IV)(first - scan + 1)));
a0d0e21e
LW
3013 /*
3014 * If there's something expensive in the r.e., find the
3015 * longest literal string that must appear and make it the
3016 * regmust. Resolve ties in favor of later strings, since
3017 * the regstart check works with the beginning of the r.e.
3018 * and avoiding duplication strengthens checking. Not a
3019 * strong reason, but sufficient in the absence of others.
3020 * [Now we resolve ties in favor of the earlier string if
c277df42 3021 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3022 * earlier string may buy us something the later one won't.]
3023 */
a0d0e21e 3024 minlen = 0;
a687059c 3025
79cb57f6
GS
3026 data.longest_fixed = newSVpvn("",0);
3027 data.longest_float = newSVpvn("",0);
3028 data.last_found = newSVpvn("",0);
c277df42
IZ
3029 data.longest = &(data.longest_fixed);
3030 first = scan;
653099ff 3031 if (!r->regstclass) {
830247a4 3032 cl_init(pRExC_state, &ch_class);
653099ff
GS
3033 data.start_class = &ch_class;
3034 stclass_flag = SCF_DO_STCLASS_AND;
3035 } else /* XXXX Check for BOUND? */
3036 stclass_flag = 0;
cb434fcc 3037 data.last_closep = &last_close;
653099ff 3038
830247a4 3039 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3040 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3041 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3042 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3043 && !RExC_seen_zerolen
3044 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3045 r->reganch |= ROPT_CHECK_ALL;
830247a4 3046 scan_commit(pRExC_state, &data);
c277df42
IZ
3047 SvREFCNT_dec(data.last_found);
3048
a0ed51b3 3049 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3050 if (longest_float_length
c277df42
IZ
3051 || (data.flags & SF_FL_BEFORE_EOL
3052 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3053 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3054 int t;
3055
a0ed51b3 3056 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3057 && data.offset_fixed == data.offset_float_min
3058 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3059 goto remove_float; /* As in (a)+. */
3060
33b8afdf
JH
3061 if (SvUTF8(data.longest_float)) {
3062 r->float_utf8 = data.longest_float;
3063 r->float_substr = Nullsv;
3064 } else {
3065 r->float_substr = data.longest_float;
3066 r->float_utf8 = Nullsv;
3067 }
c277df42
IZ
3068 r->float_min_offset = data.offset_float_min;
3069 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3070 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3071 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3072 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3073 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3074 }
3075 else {
aca2d497 3076 remove_float:
33b8afdf 3077 r->float_substr = r->float_utf8 = Nullsv;
c277df42 3078 SvREFCNT_dec(data.longest_float);
c5254dd6 3079 longest_float_length = 0;
a0d0e21e 3080 }
c277df42 3081
a0ed51b3 3082 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3083 if (longest_fixed_length
c277df42
IZ
3084 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3085 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3086 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3087 int t;
3088
33b8afdf
JH
3089 if (SvUTF8(data.longest_fixed)) {
3090 r->anchored_utf8 = data.longest_fixed;
3091 r->anchored_substr = Nullsv;
3092 } else {
3093 r->anchored_substr = data.longest_fixed;
3094 r->anchored_utf8 = Nullsv;
3095 }
c277df42 3096 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3097 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3098 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3099 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3100 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3101 }
3102 else {
33b8afdf 3103 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 3104 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3105 longest_fixed_length = 0;
a0d0e21e 3106 }
b81d288d 3107 if (r->regstclass
ffc61ed2 3108 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3109 r->regstclass = NULL;
33b8afdf
JH
3110 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3111 && stclass_flag
653099ff 3112 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3113 && !cl_is_anything(data.start_class))
3114 {
1df70142 3115 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3116
a02a5408 3117 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3118 struct regnode_charclass_class);
3119 StructCopy(data.start_class,
830247a4 3120 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3121 struct regnode_charclass_class);
830247a4 3122 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3123 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3124 PL_regdata = r->data; /* for regprop() */
a3621e74 3125 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3126 regprop(sv, (regnode*)data.start_class);
3127 PerlIO_printf(Perl_debug_log,
a0288114 3128 "synthetic stclass \"%s\".\n",
3f7c398e 3129 SvPVX_const(sv));});
653099ff 3130 }
c277df42
IZ
3131
3132 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3133 if (longest_fixed_length > longest_float_length) {
c277df42 3134 r->check_substr = r->anchored_substr;
33b8afdf 3135 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3136 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3137 if (r->reganch & ROPT_ANCH_SINGLE)
3138 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3139 }
3140 else {
c277df42 3141 r->check_substr = r->float_substr;
33b8afdf 3142 r->check_utf8 = r->float_utf8;
c277df42
IZ
3143 r->check_offset_min = data.offset_float_min;
3144 r->check_offset_max = data.offset_float_max;
a0d0e21e 3145 }
30382c73
IZ
3146 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3147 This should be changed ASAP! */
33b8afdf 3148 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3149 r->reganch |= RE_USE_INTUIT;
33b8afdf 3150 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3151 r->reganch |= RE_INTUIT_TAIL;
3152 }
a0ed51b3
LW
3153 }
3154 else {
c277df42
IZ
3155 /* Several toplevels. Best we can is to set minlen. */
3156 I32 fake;
653099ff 3157 struct regnode_charclass_class ch_class;
cb434fcc 3158 I32 last_close = 0;
c277df42 3159
a3621e74 3160 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3161 scan = r->program + 1;
830247a4 3162 cl_init(pRExC_state, &ch_class);
653099ff 3163 data.start_class = &ch_class;
cb434fcc 3164 data.last_closep = &last_close;
a3621e74 3165 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf
JH
3166 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3167 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 3168 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3169 && !cl_is_anything(data.start_class))
3170 {
1df70142 3171 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3172
a02a5408 3173 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3174 struct regnode_charclass_class);
3175 StructCopy(data.start_class,
830247a4 3176 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3177 struct regnode_charclass_class);
830247a4 3178 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3179 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3180 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3181 regprop(sv, (regnode*)data.start_class);
3182 PerlIO_printf(Perl_debug_log,
a0288114 3183 "synthetic stclass \"%s\".\n",
3f7c398e 3184 SvPVX_const(sv));});
653099ff 3185 }
a0d0e21e
LW
3186 }
3187
a0d0e21e 3188 r->minlen = minlen;
b81d288d 3189 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3190 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3191 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3192 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3193 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3194 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3195 if (RExC_seen & REG_SEEN_CANY)
3196 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3197 Newxz(r->startp, RExC_npar, I32);
3198 Newxz(r->endp, RExC_npar, I32);
ffc61ed2 3199 PL_regdata = r->data; /* for regprop() */
a3621e74 3200 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3201 return(r);
a687059c
LW
3202}
3203
3204/*
3205 - reg - regular expression, i.e. main body or parenthesized thing
3206 *
3207 * Caller must absorb opening parenthesis.
3208 *
3209 * Combining parenthesis handling with the base level of regular expression
3210 * is a trifle forced, but the need to tie the tails of the branches to what
3211 * follows makes it hard to avoid.
3212 */
76e3520e 3213STATIC regnode *
830247a4 3214S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3215 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3216{
27da23d5 3217 dVAR;
c277df42
IZ
3218 register regnode *ret; /* Will be the head of the group. */
3219 register regnode *br;
3220 register regnode *lastbr;
3221 register regnode *ender = 0;
a0d0e21e 3222 register I32 parno = 0;
e2509266 3223 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
3224
3225 /* for (?g), (?gc), and (?o) warnings; warning
3226 about (?c) will warn about (?g) -- japhy */
3227
3228 I32 wastedflags = 0x00,
3229 wasted_o = 0x01,
3230 wasted_g = 0x02,
3231 wasted_gc = 0x02 | 0x04,
3232 wasted_c = 0x04;
3233
fac92740 3234 char * parse_start = RExC_parse; /* MJD */
a28509cc 3235 char * const oregcomp_parse = RExC_parse;
c277df42 3236 char c;
a0d0e21e 3237
821b33a5 3238 *flagp = 0; /* Tentatively. */
a0d0e21e 3239
9d1d55b5 3240
a0d0e21e
LW
3241 /* Make an OPEN node, if parenthesized. */
3242 if (paren) {
fac92740 3243 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3244 U32 posflags = 0, negflags = 0;
3245 U32 *flagsp = &posflags;
0f5d15d6 3246 int logical = 0;
a28509cc 3247 const char * const seqstart = RExC_parse;
ca9dfc88 3248
830247a4
IZ
3249 RExC_parse++;
3250 paren = *RExC_parse++;
c277df42 3251 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3252 switch (paren) {
fac92740 3253 case '<': /* (?<...) */
830247a4 3254 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3255 if (*RExC_parse == '!')
c277df42 3256 paren = ',';
b81d288d 3257 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3258 goto unknown;
830247a4 3259 RExC_parse++;
fac92740
MJD
3260 case '=': /* (?=...) */
3261 case '!': /* (?!...) */
830247a4 3262 RExC_seen_zerolen++;
fac92740
MJD
3263 case ':': /* (?:...) */
3264 case '>': /* (?>...) */
a0d0e21e 3265 break;
fac92740
MJD
3266 case '$': /* (?$...) */
3267 case '@': /* (?@...) */
8615cb43 3268 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3269 break;
fac92740 3270 case '#': /* (?#...) */
830247a4
IZ
3271 while (*RExC_parse && *RExC_parse != ')')
3272 RExC_parse++;
3273 if (*RExC_parse != ')')
c277df42 3274 FAIL("Sequence (?#... not terminated");
830247a4 3275 nextchar(pRExC_state);
a0d0e21e
LW
3276 *flagp = TRYAGAIN;
3277 return NULL;
fac92740 3278 case 'p': /* (?p...) */
9014280d 3279 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3280 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3281 /* FALL THROUGH*/
fac92740 3282 case '?': /* (??...) */
0f5d15d6 3283 logical = 1;
438a3801
YST
3284 if (*RExC_parse != '{')
3285 goto unknown;
830247a4 3286 paren = *RExC_parse++;
0f5d15d6 3287 /* FALL THROUGH */
fac92740 3288 case '{': /* (?{...}) */
c277df42 3289 {
c277df42
IZ
3290 I32 count = 1, n = 0;
3291 char c;
830247a4 3292 char *s = RExC_parse;
c277df42
IZ
3293 SV *sv;
3294 OP_4tree *sop, *rop;
3295
830247a4
IZ
3296 RExC_seen_zerolen++;
3297 RExC_seen |= REG_SEEN_EVAL;
3298 while (count && (c = *RExC_parse)) {
3299 if (c == '\\' && RExC_parse[1])
3300 RExC_parse++;
b81d288d 3301 else if (c == '{')
c277df42 3302 count++;
b81d288d 3303 else if (c == '}')
c277df42 3304 count--;
830247a4 3305 RExC_parse++;
c277df42 3306 }
830247a4 3307 if (*RExC_parse != ')')
b45f050a 3308 {
b81d288d 3309 RExC_parse = s;
b45f050a
JF
3310 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3311 }
c277df42 3312 if (!SIZE_ONLY) {
f3548bdc 3313 PAD *pad;
b81d288d
AB
3314
3315 if (RExC_parse - 1 - s)
830247a4 3316 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3317 else
79cb57f6 3318 sv = newSVpvn("", 0);
c277df42 3319
569233ed
SB
3320 ENTER;
3321 Perl_save_re_context(aTHX);
f3548bdc 3322 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3323 sop->op_private |= OPpREFCOUNTED;
3324 /* re_dup will OpREFCNT_inc */
3325 OpREFCNT_set(sop, 1);
569233ed 3326 LEAVE;
c277df42 3327
830247a4
IZ
3328 n = add_data(pRExC_state, 3, "nop");
3329 RExC_rx->data->data[n] = (void*)rop;
3330 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3331 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3332 SvREFCNT_dec(sv);
a0ed51b3 3333 }
e24b16f9 3334 else { /* First pass */
830247a4 3335 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3336 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3337 /* No compiled RE interpolated, has runtime
3338 components ===> unsafe. */
3339 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3340 if (PL_tainting && PL_tainted)
cc6b7395 3341 FAIL("Eval-group in insecure regular expression");
923e4eb5 3342 if (IN_PERL_COMPILETIME)
b5c19bd7 3343 PL_cv_has_eval = 1;
c277df42 3344 }
b5c19bd7 3345
830247a4 3346 nextchar(pRExC_state);
0f5d15d6 3347 if (logical) {
830247a4 3348 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3349 if (!SIZE_ONLY)
3350 ret->flags = 2;
830247a4 3351 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3352 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3353 return ret;
3354 }
ccb2c380
MP
3355 ret = reganode(pRExC_state, EVAL, n);
3356 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3357 Set_Node_Offset(ret, parse_start);
3358 return ret;
c277df42 3359 }
fac92740 3360 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3361 {
fac92740 3362 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3363 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3364 || RExC_parse[1] == '<'
830247a4 3365 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3366 I32 flag;
3367
830247a4 3368 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3369 if (!SIZE_ONLY)
3370 ret->flags = 1;
830247a4 3371 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3372 goto insert_if;
b81d288d 3373 }
a0ed51b3 3374 }
830247a4 3375 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3376 /* (?(1)...) */
830247a4 3377 parno = atoi(RExC_parse++);
c277df42 3378
830247a4
IZ
3379 while (isDIGIT(*RExC_parse))
3380 RExC_parse++;
fac92740 3381 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3382
830247a4 3383 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3384 vFAIL("Switch condition not recognized");
c277df42 3385 insert_if:
830247a4
IZ
3386 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3387 br = regbranch(pRExC_state, &flags, 1);
c277df42 3388 if (br == NULL)
830247a4 3389 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3390 else
830247a4
IZ
3391 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3392 c = *nextchar(pRExC_state);
d1b80229
IZ
3393 if (flags&HASWIDTH)
3394 *flagp |= HASWIDTH;
c277df42 3395 if (c == '|') {
830247a4
IZ
3396 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3397 regbranch(pRExC_state, &flags, 1);
3398 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3399 if (flags&HASWIDTH)
3400 *flagp |= HASWIDTH;
830247a4 3401 c = *nextchar(pRExC_state);
a0ed51b3
LW
3402 }
3403 else
c277df42
IZ
3404 lastbr = NULL;
3405 if (c != ')')
8615cb43 3406 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3407 ender = reg_node(pRExC_state, TAIL);
3408 regtail(pRExC_state, br, ender);
c277df42 3409 if (lastbr) {
830247a4
IZ
3410 regtail(pRExC_state, lastbr, ender);
3411 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3412 }
3413 else
830247a4 3414 regtail(pRExC_state, ret, ender);
c277df42 3415 return ret;
a0ed51b3
LW
3416 }
3417 else {
830247a4 3418 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3419 }
3420 }
1b1626e4 3421 case 0:
830247a4 3422 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3423 vFAIL("Sequence (? incomplete");
1b1626e4 3424 break;
a0d0e21e 3425 default:
830247a4 3426 --RExC_parse;
fac92740 3427 parse_flags: /* (?i) */
830247a4 3428 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3429 /* (?g), (?gc) and (?o) are useless here
3430 and must be globally applied -- japhy */
3431
3432 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3433 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3434 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3435 if (! (wastedflags & wflagbit) ) {
3436 wastedflags |= wflagbit;
3437 vWARN5(
3438 RExC_parse + 1,
3439 "Useless (%s%c) - %suse /%c modifier",
3440 flagsp == &negflags ? "?-" : "?",
3441 *RExC_parse,
3442 flagsp == &negflags ? "don't " : "",
3443 *RExC_parse
3444 );
3445 }
3446 }
3447 }
3448 else if (*RExC_parse == 'c') {
3449 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3450 if (! (wastedflags & wasted_c) ) {
3451 wastedflags |= wasted_gc;
3452 vWARN3(
3453 RExC_parse + 1,
3454 "Useless (%sc) - %suse /gc modifier",
3455 flagsp == &negflags ? "?-" : "?",
3456 flagsp == &negflags ? "don't " : ""
3457 );
3458 }
3459 }
3460 }
3461 else { pmflag(flagsp, *RExC_parse); }
3462
830247a4 3463 ++RExC_parse;
ca9dfc88 3464 }
830247a4 3465 if (*RExC_parse == '-') {
ca9dfc88 3466 flagsp = &negflags;
9d1d55b5 3467 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3468 ++RExC_parse;
ca9dfc88 3469 goto parse_flags;
48c036b1 3470 }
e2509266
JH
3471 RExC_flags |= posflags;
3472 RExC_flags &= ~negflags;
830247a4
IZ
3473 if (*RExC_parse == ':') {
3474 RExC_parse++;
ca9dfc88
IZ
3475 paren = ':';
3476 break;
3477 }
c277df42 3478 unknown:
830247a4
IZ
3479 if (*RExC_parse != ')') {
3480 RExC_parse++;
3481 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3482 }
830247a4 3483 nextchar(pRExC_state);
a0d0e21e
LW
3484 *flagp = TRYAGAIN;
3485 return NULL;
3486 }
3487 }
fac92740 3488 else { /* (...) */
830247a4
IZ
3489 parno = RExC_npar;
3490 RExC_npar++;
3491 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3492 Set_Node_Length(ret, 1); /* MJD */
3493 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3494 open = 1;
a0d0e21e 3495 }
a0ed51b3 3496 }
fac92740 3497 else /* ! paren */
a0d0e21e
LW
3498 ret = NULL;
3499
3500 /* Pick up the branches, linking them together. */
fac92740 3501 parse_start = RExC_parse; /* MJD */
830247a4 3502 br = regbranch(pRExC_state, &flags, 1);
fac92740 3503 /* branch_len = (paren != 0); */
2af232bd 3504
a0d0e21e
LW
3505 if (br == NULL)
3506 return(NULL);
830247a4
IZ
3507 if (*RExC_parse == '|') {
3508 if (!SIZE_ONLY && RExC_extralen) {
3509 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3510 }
fac92740 3511 else { /* MJD */
830247a4 3512 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3513 Set_Node_Length(br, paren != 0);
3514 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3515 }
c277df42
IZ
3516 have_branch = 1;
3517 if (SIZE_ONLY)
830247a4 3518 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3519 }
3520 else if (paren == ':') {
c277df42
IZ
3521 *flagp |= flags&SIMPLE;
3522 }
3523 if (open) { /* Starts with OPEN. */
830247a4 3524 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3525 }
3526 else if (paren != '?') /* Not Conditional */
a0d0e21e 3527 ret = br;
32a0ca98 3528 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3529 lastbr = br;
830247a4
IZ
3530 while (*RExC_parse == '|') {
3531 if (!SIZE_ONLY && RExC_extralen) {
3532 ender = reganode(pRExC_state, LONGJMP,0);
3533 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3534 }
3535 if (SIZE_ONLY)
830247a4
IZ
3536 RExC_extralen += 2; /* Account for LONGJMP. */
3537 nextchar(pRExC_state);
3538 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3539
a687059c 3540 if (br == NULL)
a0d0e21e 3541 return(NULL);
830247a4 3542 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3543 lastbr = br;
821b33a5
IZ
3544 if (flags&HASWIDTH)
3545 *flagp |= HASWIDTH;
a687059c 3546 *flagp |= flags&SPSTART;
a0d0e21e
LW
3547 }
3548
c277df42
IZ
3549 if (have_branch || paren != ':') {
3550 /* Make a closing node, and hook it on the end. */
3551 switch (paren) {
3552 case ':':
830247a4 3553 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3554 break;
3555 case 1:
830247a4 3556 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3557 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3558 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3559 break;
3560 case '<':
c277df42
IZ
3561 case ',':
3562 case '=':
3563 case '!':
c277df42 3564 *flagp &= ~HASWIDTH;
821b33a5
IZ
3565 /* FALL THROUGH */
3566 case '>':
830247a4 3567 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3568 break;
3569 case 0:
830247a4 3570 ender = reg_node(pRExC_state, END);
c277df42
IZ
3571 break;
3572 }
830247a4 3573 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3574
c277df42
IZ
3575 if (have_branch) {
3576 /* Hook the tails of the branches to the closing node. */
3577 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3578 regoptail(pRExC_state, br, ender);
c277df42
IZ
3579 }
3580 }
a0d0e21e 3581 }
c277df42
IZ
3582
3583 {
e1ec3a88
AL
3584 const char *p;
3585 static const char parens[] = "=!<,>";
c277df42
IZ
3586
3587 if (paren && (p = strchr(parens, paren))) {
eb160463 3588 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3589 int flag = (p - parens) > 1;
3590
3591 if (paren == '>')
3592 node = SUSPEND, flag = 0;
830247a4 3593 reginsert(pRExC_state, node,ret);
45948336
EP
3594 Set_Node_Cur_Length(ret);
3595 Set_Node_Offset(ret, parse_start + 1);
c277df42 3596 ret->flags = flag;
830247a4 3597 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3598 }
a0d0e21e
LW
3599 }
3600
3601 /* Check for proper termination. */
ce3e6498 3602 if (paren) {
e2509266 3603 RExC_flags = oregflags;
830247a4
IZ
3604 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3605 RExC_parse = oregcomp_parse;