This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
More SvPV consting
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 *
80 **** Alterations to Henry's code are...
81 ****
4bb101f2 82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 84 ****
9ef589d8
LW
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
87
a687059c
LW
88 *
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
92 */
93#include "EXTERN.h"
864dbfa3 94#define PERL_IN_REGCOMP_C
a687059c 95#include "perl.h"
d06ea78c 96
acfe0abc 97#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
98# include "INTERN.h"
99#endif
c277df42
IZ
100
101#define REG_COMP_C
a687059c
LW
102#include "regcomp.h"
103
d4cce5f1 104#ifdef op
11343788 105#undef op
d4cce5f1 106#endif /* op */
11343788 107
fe14fcc3 108#ifdef MSDOS
7e4e8c89 109# if defined(BUGGY_MSC6)
fe14fcc3 110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 111# pragma optimize("a",off)
fe14fcc3 112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
113# pragma optimize("w",on )
114# endif /* BUGGY_MSC6 */
fe14fcc3
LW
115#endif /* MSDOS */
116
a687059c
LW
117#ifndef STATIC
118#define STATIC static
119#endif
120
830247a4 121typedef struct RExC_state_t {
e2509266 122 U32 flags; /* are we folding, multilining? */
830247a4
IZ
123 char *precomp; /* uncompiled string. */
124 regexp *rx;
fac92740 125 char *start; /* Start of input for compile */
830247a4
IZ
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 129 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 130 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
133 U32 seen;
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
136 I32 extralen;
137 I32 seen_zerolen;
138 I32 seen_evals;
1aa99e6b 139 I32 utf8;
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
fac92740 149#define RExC_start (pRExC_state->start)
830247a4
IZ
150#define RExC_end (pRExC_state->end)
151#define RExC_parse (pRExC_state->parse)
152#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 153#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 154#define RExC_emit (pRExC_state->emit)
fac92740 155#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
156#define RExC_naughty (pRExC_state->naughty)
157#define RExC_sawback (pRExC_state->sawback)
158#define RExC_seen (pRExC_state->seen)
159#define RExC_size (pRExC_state->size)
160#define RExC_npar (pRExC_state->npar)
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
830247a4 165
a687059c
LW
166#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
a687059c 169
35c8bce7
LW
170#ifdef SPSTART
171#undef SPSTART /* dratted cpp namespace... */
172#endif
a687059c
LW
173/*
174 * Flags to be passed up and down.
175 */
a687059c 176#define WORST 0 /* Worst case. */
821b33a5 177#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
178#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179#define SPSTART 0x4 /* Starts with * or +. */
180#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 181
2c2d71f5
JH
182/* Length of a variant. */
183
184typedef struct scan_data_t {
185 I32 len_min;
186 I32 len_delta;
187 I32 pos_min;
188 I32 pos_delta;
189 SV *last_found;
190 I32 last_end; /* min value, <0 unless valid. */
191 I32 last_start_min;
192 I32 last_start_max;
193 SV **longest; /* Either &l_fixed, or &l_float. */
194 SV *longest_fixed;
195 I32 offset_fixed;
196 SV *longest_float;
197 I32 offset_float_min;
198 I32 offset_float_max;
199 I32 flags;
200 I32 whilem_c;
cb434fcc 201 I32 *last_closep;
653099ff 202 struct regnode_charclass_class *start_class;
2c2d71f5
JH
203} scan_data_t;
204
a687059c 205/*
e50aee73 206 * Forward declarations for pregcomp()'s friends.
a687059c 207 */
a0d0e21e 208
27da23d5
JH
209static const scan_data_t zero_scan_data =
210 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
211
212#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213#define SF_BEFORE_SEOL 0x1
214#define SF_BEFORE_MEOL 0x2
215#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
09b7f37c
CB
218#ifdef NO_UNARY_PLUS
219# define SF_FIX_SHIFT_EOL (0+2)
220# define SF_FL_SHIFT_EOL (0+4)
221#else
222# define SF_FIX_SHIFT_EOL (+2)
223# define SF_FL_SHIFT_EOL (+4)
224#endif
c277df42
IZ
225
226#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231#define SF_IS_INF 0x40
232#define SF_HAS_PAR 0x80
233#define SF_IN_PAR 0x100
234#define SF_HAS_EVAL 0x200
4bfe0158 235#define SCF_DO_SUBSTR 0x400
653099ff
GS
236#define SCF_DO_STCLASS_AND 0x0800
237#define SCF_DO_STCLASS_OR 0x1000
238#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 239#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 240
eb160463 241#define UTF (RExC_utf8 != 0)
e2509266
JH
242#define LOC ((RExC_flags & PMf_LOCALE) != 0)
243#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 244
ffc61ed2 245#define OOB_UNICODE 12345678
93733859 246#define OOB_NAMEDCLASS -1
b8c5462f 247
a0ed51b3
LW
248#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
8615cb43 251
b45f050a
JF
252/* length of regex to show in messages that don't mark a position within */
253#define RegexLengthToShowInErrorMessages 127
254
255/*
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
259 */
7253e4e3
RK
260#define MARKER1 "<-- HERE" /* marker as it appears in the description */
261#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 262
7253e4e3 263#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
264
265/*
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 * "...".
269 */
ccb2c380 270#define FAIL(msg) STMT_START { \
bfed75c6 271 const char *ellipses = ""; \
ccb2c380
MP
272 IV len = RExC_end - RExC_precomp; \
273 \
274 if (!SIZE_ONLY) \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
279 ellipses = "..."; \
280 } \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
283} STMT_END
8615cb43 284
b45f050a
JF
285/*
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
288 * "...".
289 */
ccb2c380 290#define FAIL2(pat,msg) STMT_START { \
bfed75c6 291 const char *ellipses = ""; \
ccb2c380
MP
292 IV len = RExC_end - RExC_precomp; \
293 \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
299 ellipses = "..."; \
300 } \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
303} STMT_END
b45f050a
JF
304
305
306/*
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 */
ccb2c380
MP
309#define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
313} STMT_END
b45f050a
JF
314
315/*
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 */
ccb2c380
MP
318#define vFAIL(m) STMT_START { \
319 if (!SIZE_ONLY) \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 Simple_vFAIL(m); \
322} STMT_END
b45f050a
JF
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
ccb2c380
MP
327#define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
331} STMT_END
b45f050a
JF
332
333/*
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 */
ccb2c380
MP
336#define vFAIL2(m,a1) STMT_START { \
337 if (!SIZE_ONLY) \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
340} STMT_END
b45f050a
JF
341
342
343/*
344 * Like Simple_vFAIL(), but accepts three arguments.
345 */
ccb2c380
MP
346#define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
350} STMT_END
b45f050a
JF
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
ccb2c380
MP
355#define vFAIL3(m,a1,a2) STMT_START { \
356 if (!SIZE_ONLY) \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
359} STMT_END
b45f050a
JF
360
361/*
362 * Like Simple_vFAIL(), but accepts four arguments.
363 */
ccb2c380
MP
364#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
b45f050a
JF
369
370/*
371 * Like Simple_vFAIL(), but accepts five arguments.
372 */
ccb2c380
MP
373#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379
380#define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
385
386#define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
392
393
394#define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
398} STMT_END
399
400#define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
405
406#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
411
412#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416} STMT_END
9d1d55b5 417
8615cb43 418
cd439c50 419/* Allow for side effects in s */
ccb2c380
MP
420#define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422} STMT_END
cd439c50 423
fac92740
MJD
424/* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
428 */
429
430#define MJD_OFFSET_DEBUG(x)
a3621e74 431/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
432
433
434#define Set_Node_Offset_To_R(node,byte) STMT_START { \
435 if (! SIZE_ONLY) { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444} STMT_END
445
446#define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450#define Set_Node_Length_To_R(node,len) STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
454 if((node) < 0) { \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456 } else { \
457 RExC_offsets[2*(node)] = (len); \
458 } \
459 } \
460} STMT_END
461
462#define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465#define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
467
468/* Get offsets and lengths */
469#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
acfe0abc 472static void clear_re(pTHX_ void *r);
4327152a 473
653099ff
GS
474/* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
477
4327152a 478STATIC void
830247a4 479S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 480{
e1ec3a88
AL
481 const STRLEN l = CHR_SVLEN(data->last_found);
482 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 483
c277df42 484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 485 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
b81d288d 489 data->flags
c277df42
IZ
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491 else
492 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
493 }
494 else {
c277df42 495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
496 data->offset_float_max = (l
497 ? data->last_start_max
c277df42 498 : data->pos_min + data->pos_delta);
9051bda5
HS
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
c277df42 501 if (data->flags & SF_BEFORE_EOL)
b81d288d 502 data->flags
c277df42
IZ
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504 else
505 data->flags &= ~SF_FL_BEFORE_EOL;
506 }
507 }
508 SvCUR_set(data->last_found, 0);
0eda9292
JH
509 {
510 SV * sv = data->last_found;
511 MAGIC *mg =
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
514 mg->mg_len = 0;
515 }
c277df42
IZ
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b 588
14ebb1a2
JH
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
14ebb1a2
JH
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 597 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 600 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
601}
602
603/* 'OR' a given class with another one. Can create false positives */
604/* We assume that cl is not inverted */
605STATIC void
830247a4 606S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 607{
653099ff
GS
608 if (or_with->flags & ANYOF_INVERT) {
609 /* We do not use
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617 */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
621 int i;
622
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
626 else {
830247a4 627 cl_anything(pRExC_state, cl);
653099ff
GS
628 }
629 } else {
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 632 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
633 || (cl->flags & ANYOF_FOLD)) ) {
634 int i;
635
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
643 }
644 }
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647 }
648 }
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
651
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
656 }
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
660 }
653099ff
GS
661}
662
5d1c421c 663/*
a3621e74
YO
664
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
673
674Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
675
676A trie is an N'ary tree where the branches are determined by digital
677decomposition of the key. IE, at the root node you look up the 1st character and
678follow that branch repeat until you find the end of the branches. Nodes can be
679marked as "accepting" meaning they represent a complete word. Eg:
680
681 /he|she|his|hers/
682
683would convert into the following structure. Numbers represent states, letters
684following numbers represent valid transitions on the letter from that state, if
685the number is in square brackets it represents an accepting state, otherwise it
686will be in parenthesis.
687
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
689 | |
690 | (2)
691 | |
692 (1) +-i->(6)-+-s->[7]
693 |
694 +-s->(3)-+-h->(4)-+-e->[5]
695
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
697
698This shows that when matching against the string 'hers' we will begin at state 1
699read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702single traverse. We store a mapping from accepting to state to which word was
703matched, and then when we have multiple possibilities we try to complete the
704rest of the regex in the order in which they occured in the alternation.
705
706The only prior NFA like behaviour that would be changed by the TRIE support is
707the silent ignoring of duplicate alternations which are of the form:
708
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
710
711Thus EVAL blocks follwing a trie may be called a different number of times with
712and without the optimisation. With the optimisations dupes will be silently
713ignored. This inconsistant behaviour of EVAL type nodes is well established as
714the following demonstrates:
715
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
717
718which prints out 'word' three times, but
719
720 'words'=~/(word|word|word)(?{ print $1 })S/
721
722which doesnt print it out at all. This is due to other optimisations kicking in.
723
724Example of what happens on a structural level:
725
726The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
727
728 1: CURLYM[1] {1,32767}(18)
729 5: BRANCH(8)
730 6: EXACT <ac>(16)
731 8: BRANCH(11)
732 9: EXACT <ad>(16)
733 11: BRANCH(14)
734 12: EXACT <ab>(16)
735 16: SUCCEED(0)
736 17: NOTHING(18)
737 18: END(0)
738
739This would be optimizable with startbranch=5, first=5, last=16, tail=16
740and should turn into:
741
742 1: CURLYM[1] {1,32767}(18)
743 5: TRIE(16)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
745 <ac>
746 <ad>
747 <ab>
748 16: SUCCEED(0)
749 17: NOTHING(18)
750 18: END(0)
751
752Cases where tail != last would be like /(?foo|bar)baz/:
753
754 1: BRANCH(4)
755 2: EXACT <foo>(8)
756 4: BRANCH(7)
757 5: EXACT <bar>(8)
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762which would be optimizable with startbranch=1, first=1, last=7, tail=8
763and would end up looking like:
764
765 1: TRIE(8)
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
767 <foo>
768 <bar>
769 7: TAIL(8)
770 8: EXACT <baz>(10)
771 10: END(0)
772
773*/
774
775#define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
777 SV *tmp; \
778 if ( UTF ) { \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
781 } else { \
e4584336 782 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
783 } \
784 av_push( trie->revcharmap, tmp ); \
785 })
786
787#define TRIE_READ_CHAR STMT_START { \
788 if ( UTF ) { \
789 if ( folder ) { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
792 foldlen -= len; \
793 scan += len; \
794 len = 0; \
795 } else { \
e1ec3a88 796 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
800 } \
801 } else { \
e1ec3a88 802 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
803 } \
804 } else { \
805 uvc = (U32)*uc; \
806 len = 1; \
807 } \
808} STMT_END
809
810
811#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
815
816#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
821 } \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
825} STMT_END
826
827#define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
832} STMT_END
833
834STATIC I32
835S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
836{
27da23d5 837 dVAR;
a3621e74
YO
838 /* first pass, loop through and scan words */
839 reg_trie_data *trie;
840 regnode *cur;
e1ec3a88 841 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
842 STRLEN len = 0;
843 UV uvc = 0;
844 U16 curword = 0;
845 U32 next_alloc = 0;
846 /* we just use folder as a flag in utf8 */
e1ec3a88 847 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
848 ? PL_fold
849 : ( flags == EXACTFL
850 ? PL_fold_locale
851 : NULL
852 )
853 );
854
e1ec3a88 855 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
856 SV *re_trie_maxbuff;
857
858 GET_RE_DEBUG_FLAGS_DECL;
859
860 Newz( 848200, trie, 1, reg_trie_data );
861 trie->refcount = 1;
862 RExC_rx->data->data[ data_slot ] = (void*)trie;
863 Newz( 848201, trie->charmap, 256, U16 );
864 DEBUG_r({
865 trie->words = newAV();
866 trie->revcharmap = newAV();
867 });
868
869
0111c4fd 870 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 871 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 872 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
873 }
874
875 /* -- First loop and Setup --
876
877 We first traverse the branches and scan each word to determine if it
878 contains widechars, and how many unique chars there are, this is
879 important as we have to build a table with at least as many columns as we
880 have unique chars.
881
882 We use an array of integers to represent the character codes 0..255
883 (trie->charmap) and we use a an HV* to store unicode characters. We use the
884 native representation of the character value as the key and IV's for the
885 coded index.
886
887 *TODO* If we keep track of how many times each character is used we can
888 remap the columns so that the table compression later on is more
889 efficient in terms of memory by ensuring most common value is in the
890 middle and the least common are on the outside. IMO this would be better
891 than a most to least common mapping as theres a decent chance the most
892 common letter will share a node with the least common, meaning the node
893 will not be compressable. With a middle is most common approach the worst
894 case is when we have the least common nodes twice.
895
896 */
897
898
899 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
900 regnode *noper = NEXTOPER( cur );
e1ec3a88
AL
901 const U8 *uc = (U8*)STRING( noper );
902 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
903 STRLEN foldlen = 0;
904 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 905 const U8 *scan = (U8*)NULL;
a3621e74
YO
906
907 for ( ; uc < e ; uc += len ) {
908 trie->charcount++;
909 TRIE_READ_CHAR;
910 if ( uvc < 256 ) {
911 if ( !trie->charmap[ uvc ] ) {
912 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
913 if ( folder )
914 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
915 TRIE_DEBUG_CHAR;
916 }
917 } else {
918 SV** svpp;
919 if ( !trie->widecharmap )
920 trie->widecharmap = newHV();
921
922 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
923
924 if ( !svpp )
e4584336 925 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
926
927 if ( !SvTRUE( *svpp ) ) {
928 sv_setiv( *svpp, ++trie->uniquecharcount );
929 TRIE_DEBUG_CHAR;
930 }
931 }
932 }
933 trie->wordcount++;
934 } /* end first pass */
935 DEBUG_TRIE_COMPILE_r(
936 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
937 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 938 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
939 );
940
941
942 /*
943 We now know what we are dealing with in terms of unique chars and
944 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
945 representation using a flat table will take. If it's over a reasonable
946 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
947 conservative but potentially much slower representation using an array
948 of lists.
949
950 At the end we convert both representations into the same compressed
951 form that will be used in regexec.c for matching with. The latter
952 is a form that cannot be used to construct with but has memory
953 properties similar to the list form and access properties similar
954 to the table form making it both suitable for fast searches and
955 small enough that its feasable to store for the duration of a program.
956
957 See the comment in the code where the compressed table is produced
958 inplace from the flat tabe representation for an explanation of how
959 the compression works.
960
961 */
962
963
964 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
965 /*
966 Second Pass -- Array Of Lists Representation
967
968 Each state will be represented by a list of charid:state records
969 (reg_trie_trans_le) the first such element holds the CUR and LEN
970 points of the allocated array. (See defines above).
971
972 We build the initial structure using the lists, and then convert
973 it into the compressed table form which allows faster lookups
974 (but cant be modified once converted).
975
976
977 */
978
979
980 STRLEN transcount = 1;
981
982 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
983 TRIE_LIST_NEW(1);
984 next_alloc = 2;
985
986 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
987
988 regnode *noper = NEXTOPER( cur );
989 U8 *uc = (U8*)STRING( noper );
990 U8 *e = uc + STR_LEN( noper );
991 U32 state = 1; /* required init */
992 U16 charid = 0; /* sanity init */
993 U8 *scan = (U8*)NULL; /* sanity init */
994 STRLEN foldlen = 0; /* required init */
995 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
996
997
998 for ( ; uc < e ; uc += len ) {
999
1000 TRIE_READ_CHAR;
1001
1002 if ( uvc < 256 ) {
1003 charid = trie->charmap[ uvc ];
1004 } else {
1005 SV** svpp=(SV**)NULL;
1006 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1007 if ( !svpp ) {
1008 charid = 0;
1009 } else {
1010 charid=(U16)SvIV( *svpp );
1011 }
1012 }
1013 if ( charid ) {
1014
1015 U16 check;
1016 U32 newstate = 0;
1017
1018 charid--;
1019 if ( !trie->states[ state ].trans.list ) {
1020 TRIE_LIST_NEW( state );
1021 }
1022 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1023 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1024 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1025 break;
1026 }
1027 }
1028 if ( ! newstate ) {
1029 newstate = next_alloc++;
1030 TRIE_LIST_PUSH( state, charid, newstate );
1031 transcount++;
1032 }
1033 state = newstate;
1034
1035 } else {
e4584336 1036 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1037 }
1038 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1039 }
1040
1041 if ( !trie->states[ state ].wordnum ) {
1042 /* we havent inserted this word into the structure yet. */
1043 trie->states[ state ].wordnum = ++curword;
1044
1045 DEBUG_r({
1046 /* store the word for dumping */
1047 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1048 if ( UTF ) SvUTF8_on( tmp );
1049 av_push( trie->words, tmp );
1050 });
1051
1052 } else {
1053 /* Its a dupe. So ignore it. */
1054 }
1055
1056 } /* end second pass */
1057
1058 trie->laststate = next_alloc;
1059 Renew( trie->states, next_alloc, reg_trie_state );
1060
1061 DEBUG_TRIE_COMPILE_MORE_r({
1062 U32 state;
1063 U16 charid;
1064
1065 /*
1066 print out the table precompression.
1067 */
1068
1069 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1070 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1071
1072 for( state=1 ; state < next_alloc ; state ++ ) {
1073
e4584336 1074 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1075 if ( ! trie->states[ state ].wordnum ) {
1076 PerlIO_printf( Perl_debug_log, "%5s| ","");
1077 } else {
e4584336 1078 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1079 trie->states[ state ].wordnum
1080 );
1081 }
1082 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1083 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1084 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
a3621e74
YO
1085 SvPV_nolen( *tmp ),
1086 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1087 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1088 );
1089 }
1090
1091 }
1092 PerlIO_printf( Perl_debug_log, "\n\n" );
1093 });
1094
1095 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1096 {
1097 U32 state;
1098 U16 idx;
1099 U32 tp = 0;
1100 U32 zp = 0;
1101
1102
1103 for( state=1 ; state < next_alloc ; state ++ ) {
1104 U32 base=0;
1105
1106 /*
1107 DEBUG_TRIE_COMPILE_MORE_r(
1108 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1109 );
1110 */
1111
1112 if (trie->states[state].trans.list) {
1113 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1114 U16 maxid=minid;
1115
1116
1117 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1118 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1119 minid=TRIE_LIST_ITEM( state, idx).forid;
1120 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1121 maxid=TRIE_LIST_ITEM( state, idx).forid;
1122 }
1123 }
1124 if ( transcount < tp + maxid - minid + 1) {
1125 transcount *= 2;
1126 Renew( trie->trans, transcount, reg_trie_trans );
1127 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1128 }
1129 base = trie->uniquecharcount + tp - minid;
1130 if ( maxid == minid ) {
1131 U32 set = 0;
1132 for ( ; zp < tp ; zp++ ) {
1133 if ( ! trie->trans[ zp ].next ) {
1134 base = trie->uniquecharcount + zp - minid;
1135 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1136 trie->trans[ zp ].check = state;
1137 set = 1;
1138 break;
1139 }
1140 }
1141 if ( !set ) {
1142 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1143 trie->trans[ tp ].check = state;
1144 tp++;
1145 zp = tp;
1146 }
1147 } else {
1148 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1149 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1150 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1151 trie->trans[ tid ].check = state;
1152 }
1153 tp += ( maxid - minid + 1 );
1154 }
1155 Safefree(trie->states[ state ].trans.list);
1156 }
1157 /*
1158 DEBUG_TRIE_COMPILE_MORE_r(
1159 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1160 );
1161 */
1162 trie->states[ state ].trans.base=base;
1163 }
cc601c31 1164 trie->lasttrans = tp + 1;
a3621e74
YO
1165 }
1166 } else {
1167 /*
1168 Second Pass -- Flat Table Representation.
1169
1170 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1171 We know that we will need Charcount+1 trans at most to store the data
1172 (one row per char at worst case) So we preallocate both structures
1173 assuming worst case.
1174
1175 We then construct the trie using only the .next slots of the entry
1176 structs.
1177
1178 We use the .check field of the first entry of the node temporarily to
1179 make compression both faster and easier by keeping track of how many non
1180 zero fields are in the node.
1181
1182 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1183 transition.
1184
1185 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1186 number representing the first entry of the node, and state as a
1187 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1188 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1189 are 2 entrys per node. eg:
1190
1191 A B A B
1192 1. 2 4 1. 3 7
1193 2. 0 3 3. 0 5
1194 3. 0 0 5. 0 0
1195 4. 0 0 7. 0 0
1196
1197 The table is internally in the right hand, idx form. However as we also
1198 have to deal with the states array which is indexed by nodenum we have to
1199 use TRIE_NODENUM() to convert.
1200
1201 */
1202
1203 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1204 reg_trie_trans );
1205 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1206 next_alloc = trie->uniquecharcount + 1;
1207
1208 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1209
1210 regnode *noper = NEXTOPER( cur );
1211 U8 *uc = (U8*)STRING( noper );
1212 U8 *e = uc + STR_LEN( noper );
1213
1214 U32 state = 1; /* required init */
1215
1216 U16 charid = 0; /* sanity init */
1217 U32 accept_state = 0; /* sanity init */
1218 U8 *scan = (U8*)NULL; /* sanity init */
1219
1220 STRLEN foldlen = 0; /* required init */
1221 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1222
1223
1224 for ( ; uc < e ; uc += len ) {
1225
1226 TRIE_READ_CHAR;
1227
1228 if ( uvc < 256 ) {
1229 charid = trie->charmap[ uvc ];
1230 } else {
1231 SV** svpp=(SV**)NULL;
1232 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1233 if ( !svpp ) {
1234 charid = 0;
1235 } else {
1236 charid=(U16)SvIV( *svpp );
1237 }
1238 }
1239 if ( charid ) {
1240 charid--;
1241 if ( !trie->trans[ state + charid ].next ) {
1242 trie->trans[ state + charid ].next = next_alloc;
1243 trie->trans[ state ].check++;
1244 next_alloc += trie->uniquecharcount;
1245 }
1246 state = trie->trans[ state + charid ].next;
1247 } else {
e4584336 1248 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1249 }
1250 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1251 }
1252
1253 accept_state = TRIE_NODENUM( state );
1254 if ( !trie->states[ accept_state ].wordnum ) {
1255 /* we havent inserted this word into the structure yet. */
1256 trie->states[ accept_state ].wordnum = ++curword;
1257
1258 DEBUG_r({
1259 /* store the word for dumping */
1260 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1261 if ( UTF ) SvUTF8_on( tmp );
1262 av_push( trie->words, tmp );
1263 });
1264
1265 } else {
1266 /* Its a dupe. So ignore it. */
1267 }
1268
1269 } /* end second pass */
1270
1271 DEBUG_TRIE_COMPILE_MORE_r({
1272 /*
1273 print out the table precompression so that we can do a visual check
1274 that they are identical.
1275 */
1276 U32 state;
1277 U16 charid;
1278 PerlIO_printf( Perl_debug_log, "\nChar : " );
1279
1280 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1281 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1282 if ( tmp ) {
1283 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1284 }
1285 }
1286
1287 PerlIO_printf( Perl_debug_log, "\nState+-" );
1288
1289 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1291 }
1292
1293 PerlIO_printf( Perl_debug_log, "\n" );
1294
1295 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1296
e4584336 1297 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1298
1299 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1300 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1301 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1302 }
1303 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1304 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1305 } else {
e4584336 1306 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1307 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1308 }
1309 }
1310 PerlIO_printf( Perl_debug_log, "\n\n" );
1311 });
1312 {
1313 /*
1314 * Inplace compress the table.*
1315
1316 For sparse data sets the table constructed by the trie algorithm will
1317 be mostly 0/FAIL transitions or to put it another way mostly empty.
1318 (Note that leaf nodes will not contain any transitions.)
1319
1320 This algorithm compresses the tables by eliminating most such
1321 transitions, at the cost of a modest bit of extra work during lookup:
1322
1323 - Each states[] entry contains a .base field which indicates the
1324 index in the state[] array wheres its transition data is stored.
1325
1326 - If .base is 0 there are no valid transitions from that node.
1327
1328 - If .base is nonzero then charid is added to it to find an entry in
1329 the trans array.
1330
1331 -If trans[states[state].base+charid].check!=state then the
1332 transition is taken to be a 0/Fail transition. Thus if there are fail
1333 transitions at the front of the node then the .base offset will point
1334 somewhere inside the previous nodes data (or maybe even into a node
1335 even earlier), but the .check field determines if the transition is
1336 valid.
1337
1338 The following process inplace converts the table to the compressed
1339 table: We first do not compress the root node 1,and mark its all its
1340 .check pointers as 1 and set its .base pointer as 1 as well. This
1341 allows to do a DFA construction from the compressed table later, and
1342 ensures that any .base pointers we calculate later are greater than
1343 0.
1344
1345 - We set 'pos' to indicate the first entry of the second node.
1346
1347 - We then iterate over the columns of the node, finding the first and
1348 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1349 and set the .check pointers accordingly, and advance pos
1350 appropriately and repreat for the next node. Note that when we copy
1351 the next pointers we have to convert them from the original
1352 NODEIDX form to NODENUM form as the former is not valid post
1353 compression.
1354
1355 - If a node has no transitions used we mark its base as 0 and do not
1356 advance the pos pointer.
1357
1358 - If a node only has one transition we use a second pointer into the
1359 structure to fill in allocated fail transitions from other states.
1360 This pointer is independent of the main pointer and scans forward
1361 looking for null transitions that are allocated to a state. When it
1362 finds one it writes the single transition into the "hole". If the
1363 pointer doesnt find one the single transition is appeneded as normal.
1364
1365 - Once compressed we can Renew/realloc the structures to release the
1366 excess space.
1367
1368 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1369 specifically Fig 3.47 and the associated pseudocode.
1370
1371 demq
1372 */
a3b680e6 1373 const U32 laststate = TRIE_NODENUM( next_alloc );
a3621e74
YO
1374 U32 used , state, charid;
1375 U32 pos = 0, zp=0;
1376 trie->laststate = laststate;
1377
1378 for ( state = 1 ; state < laststate ; state++ ) {
1379 U8 flag = 0;
1380 U32 stateidx = TRIE_NODEIDX( state );
1381 U32 o_used=trie->trans[ stateidx ].check;
1382 used = trie->trans[ stateidx ].check;
1383 trie->trans[ stateidx ].check = 0;
1384
1385 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1386 if ( flag || trie->trans[ stateidx + charid ].next ) {
1387 if ( trie->trans[ stateidx + charid ].next ) {
1388 if (o_used == 1) {
1389 for ( ; zp < pos ; zp++ ) {
1390 if ( ! trie->trans[ zp ].next ) {
1391 break;
1392 }
1393 }
1394 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1395 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1396 trie->trans[ zp ].check = state;
1397 if ( ++zp > pos ) pos = zp;
1398 break;
1399 }
1400 used--;
1401 }
1402 if ( !flag ) {
1403 flag = 1;
1404 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1405 }
1406 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1407 trie->trans[ pos ].check = state;
1408 pos++;
1409 }
1410 }
1411 }
cc601c31 1412 trie->lasttrans = pos + 1;
a3621e74
YO
1413 Renew( trie->states, laststate + 1, reg_trie_state);
1414 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1415 PerlIO_printf( Perl_debug_log,
1416 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1417 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1418 (IV)next_alloc,
1419 (IV)pos,
a3621e74
YO
1420 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1421 );
1422
1423 } /* end table compress */
1424 }
cc601c31
YO
1425 /* resize the trans array to remove unused space */
1426 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1427
1428 DEBUG_TRIE_COMPILE_r({
1429 U32 state;
1430 /*
1431 Now we print it out again, in a slightly different form as there is additional
1432 info we want to be able to see when its compressed. They are close enough for
1433 visual comparison though.
1434 */
1435 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1436
1437 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1438 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1439 if ( tmp ) {
1440 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1441 }
1442 }
1443 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1444
a3621e74
YO
1445 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1446 PerlIO_printf( Perl_debug_log, "-----");
1447 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1448
a3621e74
YO
1449 for( state = 1 ; state < trie->laststate ; state++ ) {
1450 U32 base = trie->states[ state ].trans.base;
1451
e4584336 1452 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1453
1454 if ( trie->states[ state ].wordnum ) {
1455 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1456 } else {
1457 PerlIO_printf( Perl_debug_log, "%6s", "" );
1458 }
1459
e4584336 1460 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1461
1462 if ( base ) {
1463 U32 ofs = 0;
1464
cc601c31
YO
1465 while( ( base + ofs < trie->uniquecharcount ) ||
1466 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1467 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1468 ofs++;
1469
e4584336 1470 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1471
1472 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1473 if ( ( base + ofs >= trie->uniquecharcount ) &&
1474 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1475 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1476 {
e4584336
RB
1477 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1478 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1479 } else {
1480 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1481 }
1482 }
1483
e4584336 1484 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1485
1486 }
1487 PerlIO_printf( Perl_debug_log, "\n" );
1488 }
1489 });
1490
1491 {
1492 /* now finally we "stitch in" the new TRIE node
1493 This means we convert either the first branch or the first Exact,
1494 depending on whether the thing following (in 'last') is a branch
1495 or not and whther first is the startbranch (ie is it a sub part of
1496 the alternation or is it the whole thing.)
1497 Assuming its a sub part we conver the EXACT otherwise we convert
1498 the whole branch sequence, including the first.
1499 */
1500 regnode *convert;
1501
1502
1503
1504
1505 if ( first == startbranch && OP( last ) != BRANCH ) {
1506 convert = first;
1507 } else {
1508 convert = NEXTOPER( first );
1509 NEXT_OFF( first ) = (U16)(last - first);
1510 }
1511
1512 OP( convert ) = TRIE + (U8)( flags - EXACT );
1513 NEXT_OFF( convert ) = (U16)(tail - convert);
1514 ARG_SET( convert, data_slot );
1515
1516 /* tells us if we need to handle accept buffers specially */
1517 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1518
1519
1520 /* needed for dumping*/
1521 DEBUG_r({
1522 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1523 /* We now need to mark all of the space originally used by the
1524 branches as optimized away. This keeps the dumpuntil from
1525 throwing a wobbly as it doesnt use regnext() to traverse the
1526 opcodes.
1527 */
1528 while( optimize < last ) {
1529 OP( optimize ) = OPTIMIZED;
1530 optimize++;
1531 }
1532 });
1533 } /* end node insert */
1534 return 1;
1535}
1536
1537
1538
1539/*
5d1c421c
JH
1540 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1541 * These need to be revisited when a newer toolchain becomes available.
1542 */
1543#if defined(__sparc64__) && defined(__GNUC__)
1544# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1545# undef SPARC64_GCC_WORKAROUND
1546# define SPARC64_GCC_WORKAROUND 1
1547# endif
1548#endif
1549
653099ff
GS
1550/* REx optimizer. Converts nodes into quickier variants "in place".
1551 Finds fixed substrings. */
1552
a0288114 1553/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1554 to the position after last scanned or to NULL. */
1555
a3621e74 1556
76e3520e 1557STATIC I32
a3621e74 1558S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1559 /* scanp: Start here (read-write). */
1560 /* deltap: Write maxlen-minlen here. */
1561 /* last: Stop before this one. */
1562{
1563 I32 min = 0, pars = 0, code;
1564 regnode *scan = *scanp, *next;
1565 I32 delta = 0;
1566 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1567 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1568 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1569 scan_data_t data_fake;
653099ff 1570 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1571 SV *re_trie_maxbuff = NULL;
1572
1573 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1574
c277df42
IZ
1575 while (scan && OP(scan) != END && scan < last) {
1576 /* Peephole optimizer: */
a3621e74
YO
1577 DEBUG_OPTIMISE_r({
1578 SV *mysv=sv_newmortal();
1579 regprop( mysv, scan);
e4584336
RB
1580 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1581 (int)depth*2, "", SvPV_nolen(mysv), PTR2UV(scan));
a3621e74 1582 });
c277df42 1583
22c35a8c 1584 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1585 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1586 regnode *n = regnext(scan);
1587 U32 stringok = 1;
1588#ifdef DEBUGGING
1589 regnode *stop = scan;
b81d288d 1590#endif
c277df42 1591
cd439c50 1592 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1593 /* Skip NOTHING, merge EXACT*. */
1594 while (n &&
b81d288d 1595 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1596 (stringok && (OP(n) == OP(scan))))
1597 && NEXT_OFF(n)
1598 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1599 if (OP(n) == TAIL || n > next)
1600 stringok = 0;
22c35a8c 1601 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1602 NEXT_OFF(scan) += NEXT_OFF(n);
1603 next = n + NODE_STEP_REGNODE;
1604#ifdef DEBUGGING
1605 if (stringok)
1606 stop = n;
b81d288d 1607#endif
c277df42 1608 n = regnext(n);
a0ed51b3 1609 }
f49d4d0f 1610 else if (stringok) {
a3b680e6 1611 const int oldl = STR_LEN(scan);
c277df42 1612 regnode *nnext = regnext(n);
f49d4d0f 1613
b81d288d 1614 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1615 break;
1616 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1617 STR_LEN(scan) += STR_LEN(n);
1618 next = n + NODE_SZ_STR(n);
c277df42 1619 /* Now we can overwrite *n : */
f49d4d0f 1620 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1621#ifdef DEBUGGING
f49d4d0f 1622 stop = next - 1;
b81d288d 1623#endif
c277df42
IZ
1624 n = nnext;
1625 }
1626 }
61a36c01 1627
a3621e74 1628 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1629/*
1630 Two problematic code points in Unicode casefolding of EXACT nodes:
1631
1632 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1633 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1634
1635 which casefold to
1636
1637 Unicode UTF-8
1638
1639 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1640 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1641
1642 This means that in case-insensitive matching (or "loose matching",
1643 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1644 length of the above casefolded versions) can match a target string
1645 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1646 This would rather mess up the minimum length computation.
1647
1648 What we'll do is to look for the tail four bytes, and then peek
1649 at the preceding two bytes to see whether we need to decrease
1650 the minimum length by four (six minus two).
1651
1652 Thanks to the design of UTF-8, there cannot be false matches:
1653 A sequence of valid UTF-8 bytes cannot be a subsequence of
1654 another valid sequence of UTF-8 bytes.
1655
1656*/
1657 char *s0 = STRING(scan), *s, *t;
1658 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
bfed75c6
AL
1659 const char *t0 = "\xcc\x88\xcc\x81";
1660 const char *t1 = t0 + 3;
2af232bd 1661
61a36c01
JH
1662 for (s = s0 + 2;
1663 s < s2 && (t = ninstr(s, s1, t0, t1));
1664 s = t + 4) {
1665 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1666 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1667 min -= 4;
1668 }
1669 }
1670
c277df42
IZ
1671#ifdef DEBUGGING
1672 /* Allow dumping */
cd439c50 1673 n = scan + NODE_SZ_STR(scan);
c277df42 1674 while (n <= stop) {
22c35a8c 1675 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1676 OP(n) = OPTIMIZED;
1677 NEXT_OFF(n) = 0;
1678 }
1679 n++;
1680 }
653099ff 1681#endif
c277df42 1682 }
a3621e74
YO
1683
1684
1685
653099ff
GS
1686 /* Follow the next-chain of the current node and optimize
1687 away all the NOTHINGs from it. */
c277df42 1688 if (OP(scan) != CURLYX) {
a3b680e6 1689 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1690 ? I32_MAX
1691 /* I32 may be smaller than U16 on CRAYs! */
1692 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1693 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1694 int noff;
1695 regnode *n = scan;
b81d288d 1696
c277df42
IZ
1697 /* Skip NOTHING and LONGJMP. */
1698 while ((n = regnext(n))
22c35a8c 1699 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1700 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1701 && off + noff < max)
1702 off += noff;
1703 if (reg_off_by_arg[OP(scan)])
1704 ARG(scan) = off;
b81d288d 1705 else
c277df42
IZ
1706 NEXT_OFF(scan) = off;
1707 }
a3621e74 1708
653099ff
GS
1709 /* The principal pseudo-switch. Cannot be a switch, since we
1710 look into several different things. */
b81d288d 1711 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1712 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1713 next = regnext(scan);
1714 code = OP(scan);
a3621e74 1715 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1716
1717 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1718 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1719 struct regnode_charclass_class accum;
a3621e74 1720 regnode *startbranch=scan;
c277df42 1721
653099ff 1722 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1723 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1724 if (flags & SCF_DO_STCLASS)
830247a4 1725 cl_init_zero(pRExC_state, &accum);
a3621e74 1726
c277df42 1727 while (OP(scan) == code) {
830247a4 1728 I32 deltanext, minnext, f = 0, fake;
653099ff 1729 struct regnode_charclass_class this_class;
c277df42
IZ
1730
1731 num++;
1732 data_fake.flags = 0;
b81d288d 1733 if (data) {
2c2d71f5 1734 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1735 data_fake.last_closep = data->last_closep;
1736 }
1737 else
1738 data_fake.last_closep = &fake;
c277df42
IZ
1739 next = regnext(scan);
1740 scan = NEXTOPER(scan);
1741 if (code != BRANCH)
1742 scan = NEXTOPER(scan);
653099ff 1743 if (flags & SCF_DO_STCLASS) {
830247a4 1744 cl_init(pRExC_state, &this_class);
653099ff
GS
1745 data_fake.start_class = &this_class;
1746 f = SCF_DO_STCLASS_AND;
b81d288d 1747 }
e1901655
IZ
1748 if (flags & SCF_WHILEM_VISITED_POS)
1749 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1750
653099ff 1751 /* we suppose the run is continuous, last=next...*/
830247a4 1752 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1753 next, &data_fake, f,depth+1);
b81d288d 1754 if (min1 > minnext)
c277df42
IZ
1755 min1 = minnext;
1756 if (max1 < minnext + deltanext)
1757 max1 = minnext + deltanext;
1758 if (deltanext == I32_MAX)
aca2d497 1759 is_inf = is_inf_internal = 1;
c277df42
IZ
1760 scan = next;
1761 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1762 pars++;
405ff068 1763 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1764 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1765 if (data)
1766 data->whilem_c = data_fake.whilem_c;
653099ff 1767 if (flags & SCF_DO_STCLASS)
830247a4 1768 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1769 if (code == SUSPEND)
c277df42
IZ
1770 break;
1771 }
1772 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1773 min1 = 0;
1774 if (flags & SCF_DO_SUBSTR) {
1775 data->pos_min += min1;
1776 data->pos_delta += max1 - min1;
1777 if (max1 != min1 || is_inf)
1778 data->longest = &(data->longest_float);
1779 }
1780 min += min1;
1781 delta += max1 - min1;
653099ff 1782 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1783 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1784 if (min1) {
1785 cl_and(data->start_class, &and_with);
1786 flags &= ~SCF_DO_STCLASS;
1787 }
1788 }
1789 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1790 if (min1) {
1791 cl_and(data->start_class, &accum);
653099ff 1792 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1793 }
1794 else {
b81d288d 1795 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1796 * data->start_class */
1797 StructCopy(data->start_class, &and_with,
1798 struct regnode_charclass_class);
1799 flags &= ~SCF_DO_STCLASS_AND;
1800 StructCopy(&accum, data->start_class,
1801 struct regnode_charclass_class);
1802 flags |= SCF_DO_STCLASS_OR;
1803 data->start_class->flags |= ANYOF_EOS;
1804 }
653099ff 1805 }
a3621e74
YO
1806
1807 /* demq.
1808
1809 Assuming this was/is a branch we are dealing with: 'scan' now
1810 points at the item that follows the branch sequence, whatever
1811 it is. We now start at the beginning of the sequence and look
1812 for subsequences of
1813
1814 BRANCH->EXACT=>X
1815 BRANCH->EXACT=>X
1816
1817 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1818
1819 If we can find such a subseqence we need to turn the first
1820 element into a trie and then add the subsequent branch exact
1821 strings to the trie.
1822
1823 We have two cases
1824
1825 1. patterns where the whole set of branch can be converted to a trie,
1826
1827 2. patterns where only a subset of the alternations can be
1828 converted to a trie.
1829
1830 In case 1 we can replace the whole set with a single regop
1831 for the trie. In case 2 we need to keep the start and end
1832 branchs so
1833
1834 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1835 becomes BRANCH TRIE; BRANCH X;
1836
1837 Hypthetically when we know the regex isnt anchored we can
1838 turn a case 1 into a DFA and let it rip... Every time it finds a match
1839 it would just call its tail, no WHILEM/CURLY needed.
1840
1841 */
0111c4fd
RGS
1842 if (DO_TRIE) {
1843 if (!re_trie_maxbuff) {
1844 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1845 if (!SvIOK(re_trie_maxbuff))
1846 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1847 }
a3621e74
YO
1848 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1849 regnode *cur;
1850 regnode *first = (regnode *)NULL;
1851 regnode *last = (regnode *)NULL;
1852 regnode *tail = scan;
1853 U8 optype = 0;
1854 U32 count=0;
1855
1856#ifdef DEBUGGING
1857 SV *mysv = sv_newmortal(); /* for dumping */
1858#endif
1859 /* var tail is used because there may be a TAIL
1860 regop in the way. Ie, the exacts will point to the
1861 thing following the TAIL, but the last branch will
1862 point at the TAIL. So we advance tail. If we
1863 have nested (?:) we may have to move through several
1864 tails.
1865 */
1866
1867 while ( OP( tail ) == TAIL ) {
1868 /* this is the TAIL generated by (?:) */
1869 tail = regnext( tail );
1870 }
1871
1872 DEBUG_OPTIMISE_r({
1873 regprop( mysv, tail );
1874 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
e4584336 1875 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
a3621e74
YO
1876 (RExC_seen_evals) ? "[EVAL]" : ""
1877 );
1878 });
1879 /*
1880
1881 step through the branches, cur represents each
1882 branch, noper is the first thing to be matched
1883 as part of that branch and noper_next is the
1884 regnext() of that node. if noper is an EXACT
1885 and noper_next is the same as scan (our current
1886 position in the regex) then the EXACT branch is
1887 a possible optimization target. Once we have
1888 two or more consequetive such branches we can
1889 create a trie of the EXACT's contents and stich
1890 it in place. If the sequence represents all of
1891 the branches we eliminate the whole thing and
1892 replace it with a single TRIE. If it is a
1893 subsequence then we need to stitch it in. This
1894 means the first branch has to remain, and needs
1895 to be repointed at the item on the branch chain
1896 following the last branch optimized. This could
1897 be either a BRANCH, in which case the
1898 subsequence is internal, or it could be the
1899 item following the branch sequence in which
1900 case the subsequence is at the end.
1901
1902 */
1903
1904 /* dont use tail as the end marker for this traverse */
1905 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1906 regnode *noper = NEXTOPER( cur );
1907 regnode *noper_next = regnext( noper );
1908
a3621e74
YO
1909 DEBUG_OPTIMISE_r({
1910 regprop( mysv, cur);
1911 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1912 (int)depth * 2 + 2," ", SvPV_nolen( mysv ) );
a3621e74
YO
1913
1914 regprop( mysv, noper);
1915 PerlIO_printf( Perl_debug_log, " -> %s",
1916 SvPV_nolen(mysv));
1917
1918 if ( noper_next ) {
1919 regprop( mysv, noper_next );
1920 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1921 SvPV_nolen(mysv));
1922 }
1923 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1924 first, last, cur );
1925 });
1926 if ( ( first ? OP( noper ) == optype
1927 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1928 && noper_next == tail && count<U16_MAX)
1929 {
1930 count++;
1931 if ( !first ) {
1932 first = cur;
1933 optype = OP( noper );
1934 } else {
1935 DEBUG_OPTIMISE_r(
1936 if (!last ) {
1937 regprop( mysv, first);
1938 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1939 (int)depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
a3621e74
YO
1940 regprop( mysv, NEXTOPER(first) );
1941 PerlIO_printf( Perl_debug_log, " -> %s\n",
1942 SvPV_nolen( mysv ) );
1943 }
1944 );
1945 last = cur;
1946 DEBUG_OPTIMISE_r({
1947 regprop( mysv, cur);
1948 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1949 (int)depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
a3621e74
YO
1950 regprop( mysv, noper );
1951 PerlIO_printf( Perl_debug_log, " -> %s\n",
1952 SvPV_nolen( mysv ) );
1953 });
1954 }
1955 } else {
1956 if ( last ) {
1957 DEBUG_OPTIMISE_r(
1958 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1959 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1960 );
1961 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1962 }
1963 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1964 && noper_next == tail )
1965 {
1966 count = 1;
1967 first = cur;
1968 optype = OP( noper );
1969 } else {
1970 count = 0;
1971 first = NULL;
1972 optype = 0;
1973 }
1974 last = NULL;
1975 }
1976 }
1977 DEBUG_OPTIMISE_r({
1978 regprop( mysv, cur);
1979 PerlIO_printf( Perl_debug_log,
e4584336 1980 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
a3621e74
YO
1981 " ", SvPV_nolen( mysv ), first, last, cur);
1982
1983 });
1984 if ( last ) {
1985 DEBUG_OPTIMISE_r(
1986 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1987 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1988 );
1989 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1990 }
1991 }
1992 }
a0ed51b3 1993 }
a3621e74 1994 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1995 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1996 } else /* single branch is optimized. */
c277df42
IZ
1997 scan = NEXTOPER(scan);
1998 continue;
a0ed51b3
LW
1999 }
2000 else if (OP(scan) == EXACT) {
cd439c50 2001 I32 l = STR_LEN(scan);
1aa99e6b 2002 UV uc = *((U8*)STRING(scan));
a0ed51b3 2003 if (UTF) {
a3b680e6 2004 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2005 l = utf8_length(s, s + l);
9041c2e3 2006 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2007 }
2008 min += l;
c277df42 2009 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2010 /* The code below prefers earlier match for fixed
2011 offset, later match for variable offset. */
2012 if (data->last_end == -1) { /* Update the start info. */
2013 data->last_start_min = data->pos_min;
2014 data->last_start_max = is_inf
b81d288d 2015 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2016 }
cd439c50 2017 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
2018 {
2019 SV * sv = data->last_found;
2020 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2021 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2022 if (mg && mg->mg_len >= 0)
5e43f467
JH
2023 mg->mg_len += utf8_length((U8*)STRING(scan),
2024 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2025 }
33b8afdf
JH
2026 if (UTF)
2027 SvUTF8_on(data->last_found);
c277df42
IZ
2028 data->last_end = data->pos_min + l;
2029 data->pos_min += l; /* As in the first entry. */
2030 data->flags &= ~SF_BEFORE_EOL;
2031 }
653099ff
GS
2032 if (flags & SCF_DO_STCLASS_AND) {
2033 /* Check whether it is compatible with what we know already! */
2034 int compat = 1;
2035
1aa99e6b 2036 if (uc >= 0x100 ||
516a5887 2037 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2038 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2039 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2040 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2041 )
653099ff
GS
2042 compat = 0;
2043 ANYOF_CLASS_ZERO(data->start_class);
2044 ANYOF_BITMAP_ZERO(data->start_class);
2045 if (compat)
1aa99e6b 2046 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2047 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2048 if (uc < 0x100)
2049 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2050 }
2051 else if (flags & SCF_DO_STCLASS_OR) {
2052 /* false positive possible if the class is case-folded */
1aa99e6b 2053 if (uc < 0x100)
9b877dbb
IH
2054 ANYOF_BITMAP_SET(data->start_class, uc);
2055 else
2056 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2057 data->start_class->flags &= ~ANYOF_EOS;
2058 cl_and(data->start_class, &and_with);
2059 }
2060 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2061 }
653099ff 2062 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2063 I32 l = STR_LEN(scan);
1aa99e6b 2064 UV uc = *((U8*)STRING(scan));
653099ff
GS
2065
2066 /* Search for fixed substrings supports EXACT only. */
b81d288d 2067 if (flags & SCF_DO_SUBSTR)
830247a4 2068 scan_commit(pRExC_state, data);
a0ed51b3 2069 if (UTF) {
1aa99e6b
IH
2070 U8 *s = (U8 *)STRING(scan);
2071 l = utf8_length(s, s + l);
9041c2e3 2072 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2073 }
2074 min += l;
c277df42 2075 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2076 data->pos_min += l;
653099ff
GS
2077 if (flags & SCF_DO_STCLASS_AND) {
2078 /* Check whether it is compatible with what we know already! */
2079 int compat = 1;
2080
1aa99e6b 2081 if (uc >= 0x100 ||
516a5887 2082 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2083 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2084 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2085 compat = 0;
2086 ANYOF_CLASS_ZERO(data->start_class);
2087 ANYOF_BITMAP_ZERO(data->start_class);
2088 if (compat) {
1aa99e6b 2089 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2090 data->start_class->flags &= ~ANYOF_EOS;
2091 data->start_class->flags |= ANYOF_FOLD;
2092 if (OP(scan) == EXACTFL)
2093 data->start_class->flags |= ANYOF_LOCALE;
2094 }
2095 }
2096 else if (flags & SCF_DO_STCLASS_OR) {
2097 if (data->start_class->flags & ANYOF_FOLD) {
2098 /* false positive possible if the class is case-folded.
2099 Assume that the locale settings are the same... */
1aa99e6b
IH
2100 if (uc < 0x100)
2101 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2102 data->start_class->flags &= ~ANYOF_EOS;
2103 }
2104 cl_and(data->start_class, &and_with);
2105 }
2106 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2107 }
bfed75c6 2108 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2109 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2110 I32 f = flags, pos_before = 0;
c277df42 2111 regnode *oscan = scan;
653099ff
GS
2112 struct regnode_charclass_class this_class;
2113 struct regnode_charclass_class *oclass = NULL;
727f22e3 2114 I32 next_is_eval = 0;
653099ff 2115
22c35a8c 2116 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2117 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2118 scan = NEXTOPER(scan);
2119 goto finish;
2120 case PLUS:
653099ff 2121 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2122 next = NEXTOPER(scan);
653099ff 2123 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2124 mincount = 1;
2125 maxcount = REG_INFTY;
c277df42
IZ
2126 next = regnext(scan);
2127 scan = NEXTOPER(scan);
2128 goto do_curly;
2129 }
2130 }
2131 if (flags & SCF_DO_SUBSTR)
2132 data->pos_min++;
2133 min++;
2134 /* Fall through. */
2135 case STAR:
653099ff
GS
2136 if (flags & SCF_DO_STCLASS) {
2137 mincount = 0;
b81d288d 2138 maxcount = REG_INFTY;
653099ff
GS
2139 next = regnext(scan);
2140 scan = NEXTOPER(scan);
2141 goto do_curly;
2142 }
b81d288d 2143 is_inf = is_inf_internal = 1;
c277df42
IZ
2144 scan = regnext(scan);
2145 if (flags & SCF_DO_SUBSTR) {
830247a4 2146 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2147 data->longest = &(data->longest_float);
2148 }
2149 goto optimize_curly_tail;
2150 case CURLY:
b81d288d 2151 mincount = ARG1(scan);
c277df42
IZ
2152 maxcount = ARG2(scan);
2153 next = regnext(scan);
cb434fcc
IZ
2154 if (OP(scan) == CURLYX) {
2155 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2156 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2157 }
c277df42 2158 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2159 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2160 do_curly:
2161 if (flags & SCF_DO_SUBSTR) {
830247a4 2162 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2163 pos_before = data->pos_min;
2164 }
2165 if (data) {
2166 fl = data->flags;
2167 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2168 if (is_inf)
2169 data->flags |= SF_IS_INF;
2170 }
653099ff 2171 if (flags & SCF_DO_STCLASS) {
830247a4 2172 cl_init(pRExC_state, &this_class);
653099ff
GS
2173 oclass = data->start_class;
2174 data->start_class = &this_class;
2175 f |= SCF_DO_STCLASS_AND;
2176 f &= ~SCF_DO_STCLASS_OR;
2177 }
e1901655
IZ
2178 /* These are the cases when once a subexpression
2179 fails at a particular position, it cannot succeed
2180 even after backtracking at the enclosing scope.
b81d288d 2181
e1901655
IZ
2182 XXXX what if minimal match and we are at the
2183 initial run of {n,m}? */
2184 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2185 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2186
c277df42 2187 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2188 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2189 (mincount == 0
2190 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2191
2192 if (flags & SCF_DO_STCLASS)
2193 data->start_class = oclass;
2194 if (mincount == 0 || minnext == 0) {
2195 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2196 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2197 }
2198 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2199 /* Switch to OR mode: cache the old value of
653099ff
GS
2200 * data->start_class */
2201 StructCopy(data->start_class, &and_with,
2202 struct regnode_charclass_class);
2203 flags &= ~SCF_DO_STCLASS_AND;
2204 StructCopy(&this_class, data->start_class,
2205 struct regnode_charclass_class);
2206 flags |= SCF_DO_STCLASS_OR;
2207 data->start_class->flags |= ANYOF_EOS;
2208 }
2209 } else { /* Non-zero len */
2210 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2211 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2212 cl_and(data->start_class, &and_with);
2213 }
2214 else if (flags & SCF_DO_STCLASS_AND)
2215 cl_and(data->start_class, &this_class);
2216 flags &= ~SCF_DO_STCLASS;
2217 }
c277df42
IZ
2218 if (!scan) /* It was not CURLYX, but CURLY. */
2219 scan = next;
84037bb0 2220 if (ckWARN(WARN_REGEXP)
727f22e3
JP
2221 /* ? quantifier ok, except for (?{ ... }) */
2222 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2223 && (minnext == 0) && (deltanext == 0)
99799961 2224 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 2225 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 2226 {
830247a4 2227 vWARN(RExC_parse,
b45f050a
JF
2228 "Quantifier unexpected on zero-length expression");
2229 }
2230
c277df42 2231 min += minnext * mincount;
b81d288d 2232 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2233 && (minnext + deltanext) > 0)
2234 || deltanext == I32_MAX);
aca2d497 2235 is_inf |= is_inf_internal;
c277df42
IZ
2236 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2237
2238 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2239 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2240 && data->flags & SF_IN_PAR
2241 && !(data->flags & SF_HAS_EVAL)
2242 && !deltanext && minnext == 1 ) {
2243 /* Try to optimize to CURLYN. */
2244 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2245 regnode *nxt1 = nxt;
2246#ifdef DEBUGGING
2247 regnode *nxt2;
2248#endif
c277df42
IZ
2249
2250 /* Skip open. */
2251 nxt = regnext(nxt);
bfed75c6 2252 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2253 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2254 && STR_LEN(nxt) == 1))
c277df42 2255 goto nogo;
497b47a8 2256#ifdef DEBUGGING
c277df42 2257 nxt2 = nxt;
497b47a8 2258#endif
c277df42 2259 nxt = regnext(nxt);
b81d288d 2260 if (OP(nxt) != CLOSE)
c277df42
IZ
2261 goto nogo;
2262 /* Now we know that nxt2 is the only contents: */
eb160463 2263 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2264 OP(oscan) = CURLYN;
2265 OP(nxt1) = NOTHING; /* was OPEN. */
2266#ifdef DEBUGGING
2267 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2268 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2269 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2270 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2271 OP(nxt + 1) = OPTIMIZED; /* was count. */
2272 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2273#endif
c277df42 2274 }
c277df42
IZ
2275 nogo:
2276
2277 /* Try optimization CURLYX => CURLYM. */
b81d288d 2278 if ( OP(oscan) == CURLYX && data
c277df42 2279 && !(data->flags & SF_HAS_PAR)
c277df42 2280 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2281 && !deltanext /* atom is fixed width */
2282 && minnext != 0 /* CURLYM can't handle zero width */
2283 ) {
c277df42
IZ
2284 /* XXXX How to optimize if data == 0? */
2285 /* Optimize to a simpler form. */
2286 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2287 regnode *nxt2;
2288
2289 OP(oscan) = CURLYM;
2290 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2291 && (OP(nxt2) != WHILEM))
c277df42
IZ
2292 nxt = nxt2;
2293 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2294 /* Need to optimize away parenths. */
2295 if (data->flags & SF_IN_PAR) {
2296 /* Set the parenth number. */
2297 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2298
b81d288d 2299 if (OP(nxt) != CLOSE)
b45f050a 2300 FAIL("Panic opt close");
eb160463 2301 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2302 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2303 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2304#ifdef DEBUGGING
2305 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2306 OP(nxt + 1) = OPTIMIZED; /* was count. */
2307 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2308 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2309#endif
c277df42
IZ
2310#if 0
2311 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2312 regnode *nnxt = regnext(nxt1);
b81d288d 2313
c277df42
IZ
2314 if (nnxt == nxt) {
2315 if (reg_off_by_arg[OP(nxt1)])
2316 ARG_SET(nxt1, nxt2 - nxt1);
2317 else if (nxt2 - nxt1 < U16_MAX)
2318 NEXT_OFF(nxt1) = nxt2 - nxt1;
2319 else
2320 OP(nxt) = NOTHING; /* Cannot beautify */
2321 }
2322 nxt1 = nnxt;
2323 }
2324#endif
2325 /* Optimize again: */
b81d288d 2326 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2327 NULL, 0,depth+1);
a0ed51b3
LW
2328 }
2329 else
c277df42 2330 oscan->flags = 0;
c277df42 2331 }
e1901655
IZ
2332 else if ((OP(oscan) == CURLYX)
2333 && (flags & SCF_WHILEM_VISITED_POS)
2334 /* See the comment on a similar expression above.
2335 However, this time it not a subexpression
2336 we care about, but the expression itself. */
2337 && (maxcount == REG_INFTY)
2338 && data && ++data->whilem_c < 16) {
2339 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2340 /* Find WHILEM (as in regexec.c) */
2341 regnode *nxt = oscan + NEXT_OFF(oscan);
2342
2343 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2344 nxt += ARG(nxt);
eb160463
GS
2345 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2346 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2347 }
b81d288d 2348 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2349 pars++;
2350 if (flags & SCF_DO_SUBSTR) {
2351 SV *last_str = Nullsv;
2352 int counted = mincount != 0;
2353
2354 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2355#if defined(SPARC64_GCC_WORKAROUND)
2356 I32 b = 0;
2357 STRLEN l = 0;
2358 char *s = NULL;
2359 I32 old = 0;
2360
2361 if (pos_before >= data->last_start_min)
2362 b = pos_before;
2363 else
2364 b = data->last_start_min;
2365
2366 l = 0;
2367 s = SvPV(data->last_found, l);
2368 old = b - data->last_start_min;
2369
2370#else
b81d288d 2371 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2372 ? pos_before : data->last_start_min;
2373 STRLEN l;
2374 char *s = SvPV(data->last_found, l);
a0ed51b3 2375 I32 old = b - data->last_start_min;
5d1c421c 2376#endif
a0ed51b3
LW
2377
2378 if (UTF)
2379 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2380
a0ed51b3 2381 l -= old;
c277df42 2382 /* Get the added string: */
79cb57f6 2383 last_str = newSVpvn(s + old, l);
0e933229
IH
2384 if (UTF)
2385 SvUTF8_on(last_str);
c277df42
IZ
2386 if (deltanext == 0 && pos_before == b) {
2387 /* What was added is a constant string */
2388 if (mincount > 1) {
2389 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2390 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2391 SvPVX_const(last_str), l, mincount - 1);
b162af07 2392 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2393 /* Add additional parts. */
b81d288d 2394 SvCUR_set(data->last_found,
c277df42
IZ
2395 SvCUR(data->last_found) - l);
2396 sv_catsv(data->last_found, last_str);
0eda9292
JH
2397 {
2398 SV * sv = data->last_found;
2399 MAGIC *mg =
2400 SvUTF8(sv) && SvMAGICAL(sv) ?
2401 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2402 if (mg && mg->mg_len >= 0)
2403 mg->mg_len += CHR_SVLEN(last_str);
2404 }
c277df42
IZ
2405 data->last_end += l * (mincount - 1);
2406 }
2a8d9689
HS
2407 } else {
2408 /* start offset must point into the last copy */
2409 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2410 data->last_start_max += is_inf ? I32_MAX
2411 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2412 }
2413 }
2414 /* It is counted once already... */
2415 data->pos_min += minnext * (mincount - counted);
2416 data->pos_delta += - counted * deltanext +
2417 (minnext + deltanext) * maxcount - minnext * mincount;
2418 if (mincount != maxcount) {
653099ff
GS
2419 /* Cannot extend fixed substrings found inside
2420 the group. */
830247a4 2421 scan_commit(pRExC_state,data);
c277df42
IZ
2422 if (mincount && last_str) {
2423 sv_setsv(data->last_found, last_str);
2424 data->last_end = data->pos_min;
b81d288d 2425 data->last_start_min =
a0ed51b3 2426 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2427 data->last_start_max = is_inf
2428 ? I32_MAX
c277df42 2429 : data->pos_min + data->pos_delta
a0ed51b3 2430 - CHR_SVLEN(last_str);
c277df42
IZ
2431 }
2432 data->longest = &(data->longest_float);
2433 }
aca2d497 2434 SvREFCNT_dec(last_str);
c277df42 2435 }
405ff068 2436 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2437 data->flags |= SF_HAS_EVAL;
2438 optimize_curly_tail:
c277df42 2439 if (OP(oscan) != CURLYX) {
22c35a8c 2440 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2441 && NEXT_OFF(next))
2442 NEXT_OFF(oscan) += NEXT_OFF(next);
2443 }
c277df42 2444 continue;
653099ff 2445 default: /* REF and CLUMP only? */
c277df42 2446 if (flags & SCF_DO_SUBSTR) {
830247a4 2447 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2448 data->longest = &(data->longest_float);
2449 }
aca2d497 2450 is_inf = is_inf_internal = 1;
653099ff 2451 if (flags & SCF_DO_STCLASS_OR)
830247a4 2452 cl_anything(pRExC_state, data->start_class);
653099ff 2453 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2454 break;
2455 }
a0ed51b3 2456 }
bfed75c6 2457 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2458 int value = 0;
653099ff 2459
c277df42 2460 if (flags & SCF_DO_SUBSTR) {
830247a4 2461 scan_commit(pRExC_state,data);
c277df42
IZ
2462 data->pos_min++;
2463 }
2464 min++;
653099ff
GS
2465 if (flags & SCF_DO_STCLASS) {
2466 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2467
2468 /* Some of the logic below assumes that switching
2469 locale on will only add false positives. */
2470 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2471 case SANY:
653099ff
GS
2472 default:
2473 do_default:
2474 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2475 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2476 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2477 break;
2478 case REG_ANY:
2479 if (OP(scan) == SANY)
2480 goto do_default;
2481 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2482 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2483 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2484 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2485 }
2486 if (flags & SCF_DO_STCLASS_AND || !value)
2487 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2488 break;
2489 case ANYOF:
2490 if (flags & SCF_DO_STCLASS_AND)
2491 cl_and(data->start_class,
2492 (struct regnode_charclass_class*)scan);
2493 else
830247a4 2494 cl_or(pRExC_state, data->start_class,
653099ff
GS
2495 (struct regnode_charclass_class*)scan);
2496 break;
2497 case ALNUM:
2498 if (flags & SCF_DO_STCLASS_AND) {
2499 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2500 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2501 for (value = 0; value < 256; value++)
2502 if (!isALNUM(value))
2503 ANYOF_BITMAP_CLEAR(data->start_class, value);
2504 }
2505 }
2506 else {
2507 if (data->start_class->flags & ANYOF_LOCALE)
2508 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2509 else {
2510 for (value = 0; value < 256; value++)
2511 if (isALNUM(value))
b81d288d 2512 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2513 }
2514 }
2515 break;
2516 case ALNUML:
2517 if (flags & SCF_DO_STCLASS_AND) {
2518 if (data->start_class->flags & ANYOF_LOCALE)
2519 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2520 }
2521 else {
2522 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2523 data->start_class->flags |= ANYOF_LOCALE;
2524 }
2525 break;
2526 case NALNUM:
2527 if (flags & SCF_DO_STCLASS_AND) {
2528 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2530 for (value = 0; value < 256; value++)
2531 if (isALNUM(value))
2532 ANYOF_BITMAP_CLEAR(data->start_class, value);
2533 }
2534 }
2535 else {
2536 if (data->start_class->flags & ANYOF_LOCALE)
2537 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2538 else {
2539 for (value = 0; value < 256; value++)
2540 if (!isALNUM(value))
b81d288d 2541 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2542 }
2543 }
2544 break;
2545 case NALNUML:
2546 if (flags & SCF_DO_STCLASS_AND) {
2547 if (data->start_class->flags & ANYOF_LOCALE)
2548 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2549 }
2550 else {
2551 data->start_class->flags |= ANYOF_LOCALE;
2552 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2553 }
2554 break;
2555 case SPACE:
2556 if (flags & SCF_DO_STCLASS_AND) {
2557 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2558 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2559 for (value = 0; value < 256; value++)
2560 if (!isSPACE(value))
2561 ANYOF_BITMAP_CLEAR(data->start_class, value);
2562 }
2563 }
2564 else {
2565 if (data->start_class->flags & ANYOF_LOCALE)
2566 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2567 else {
2568 for (value = 0; value < 256; value++)
2569 if (isSPACE(value))
b81d288d 2570 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2571 }
2572 }
2573 break;
2574 case SPACEL:
2575 if (flags & SCF_DO_STCLASS_AND) {
2576 if (data->start_class->flags & ANYOF_LOCALE)
2577 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2578 }
2579 else {
2580 data->start_class->flags |= ANYOF_LOCALE;
2581 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2582 }
2583 break;
2584 case NSPACE:
2585 if (flags & SCF_DO_STCLASS_AND) {
2586 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2587 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2588 for (value = 0; value < 256; value++)
2589 if (isSPACE(value))
2590 ANYOF_BITMAP_CLEAR(data->start_class, value);
2591 }
2592 }
2593 else {
2594 if (data->start_class->flags & ANYOF_LOCALE)
2595 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2596 else {
2597 for (value = 0; value < 256; value++)
2598 if (!isSPACE(value))
b81d288d 2599 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2600 }
2601 }
2602 break;
2603 case NSPACEL:
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 if (data->start_class->flags & ANYOF_LOCALE) {
2606 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2607 for (value = 0; value < 256; value++)
2608 if (!isSPACE(value))
2609 ANYOF_BITMAP_CLEAR(data->start_class, value);
2610 }
2611 }
2612 else {
2613 data->start_class->flags |= ANYOF_LOCALE;
2614 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2615 }
2616 break;
2617 case DIGIT:
2618 if (flags & SCF_DO_STCLASS_AND) {
2619 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2620 for (value = 0; value < 256; value++)
2621 if (!isDIGIT(value))
2622 ANYOF_BITMAP_CLEAR(data->start_class, value);
2623 }
2624 else {
2625 if (data->start_class->flags & ANYOF_LOCALE)
2626 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2627 else {
2628 for (value = 0; value < 256; value++)
2629 if (isDIGIT(value))
b81d288d 2630 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2631 }
2632 }
2633 break;
2634 case NDIGIT:
2635 if (flags & SCF_DO_STCLASS_AND) {
2636 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2637 for (value = 0; value < 256; value++)
2638 if (isDIGIT(value))
2639 ANYOF_BITMAP_CLEAR(data->start_class, value);
2640 }
2641 else {
2642 if (data->start_class->flags & ANYOF_LOCALE)
2643 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2644 else {
2645 for (value = 0; value < 256; value++)
2646 if (!isDIGIT(value))
b81d288d 2647 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2648 }
2649 }
2650 break;
2651 }
2652 if (flags & SCF_DO_STCLASS_OR)
2653 cl_and(data->start_class, &and_with);
2654 flags &= ~SCF_DO_STCLASS;
2655 }
a0ed51b3 2656 }
22c35a8c 2657 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2658 data->flags |= (OP(scan) == MEOL
2659 ? SF_BEFORE_MEOL
2660 : SF_BEFORE_SEOL);
a0ed51b3 2661 }
653099ff
GS
2662 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2663 /* Lookbehind, or need to calculate parens/evals/stclass: */
2664 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2665 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2666 /* Lookahead/lookbehind */
cb434fcc 2667 I32 deltanext, minnext, fake = 0;
c277df42 2668 regnode *nscan;
653099ff
GS
2669 struct regnode_charclass_class intrnl;
2670 int f = 0;
c277df42
IZ
2671
2672 data_fake.flags = 0;
b81d288d 2673 if (data) {
2c2d71f5 2674 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2675 data_fake.last_closep = data->last_closep;
2676 }
2677 else
2678 data_fake.last_closep = &fake;
653099ff
GS
2679 if ( flags & SCF_DO_STCLASS && !scan->flags
2680 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2681 cl_init(pRExC_state, &intrnl);
653099ff 2682 data_fake.start_class = &intrnl;
e1901655 2683 f |= SCF_DO_STCLASS_AND;
653099ff 2684 }
e1901655
IZ
2685 if (flags & SCF_WHILEM_VISITED_POS)
2686 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2687 next = regnext(scan);
2688 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2689 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2690 if (scan->flags) {
2691 if (deltanext) {
9baa0206 2692 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2693 }
2694 else if (minnext > U8_MAX) {
9baa0206 2695 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2696 }
eb160463 2697 scan->flags = (U8)minnext;
c277df42
IZ
2698 }
2699 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2700 pars++;
405ff068 2701 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2702 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2703 if (data)
2704 data->whilem_c = data_fake.whilem_c;
e1901655 2705 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
2706 int was = (data->start_class->flags & ANYOF_EOS);
2707
2708 cl_and(data->start_class, &intrnl);
2709 if (was)
2710 data->start_class->flags |= ANYOF_EOS;
2711 }
a0ed51b3
LW
2712 }
2713 else if (OP(scan) == OPEN) {
c277df42 2714 pars++;
a0ed51b3 2715 }
cb434fcc 2716 else if (OP(scan) == CLOSE) {
eb160463 2717 if ((I32)ARG(scan) == is_par) {
cb434fcc 2718 next = regnext(scan);
c277df42 2719
cb434fcc
IZ
2720 if ( next && (OP(next) != WHILEM) && next < last)
2721 is_par = 0; /* Disable optimization */
2722 }
2723 if (data)
2724 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2725 }
2726 else if (OP(scan) == EVAL) {
c277df42
IZ
2727 if (data)
2728 data->flags |= SF_HAS_EVAL;
2729 }
96776eda 2730 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2731 if (flags & SCF_DO_SUBSTR) {
830247a4 2732 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2733 data->longest = &(data->longest_float);
2734 }
2735 is_inf = is_inf_internal = 1;
653099ff 2736 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2737 cl_anything(pRExC_state, data->start_class);
96776eda 2738 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2739 }
c277df42
IZ
2740 /* Else: zero-length, ignore. */
2741 scan = regnext(scan);
2742 }
2743
2744 finish:
2745 *scanp = scan;
aca2d497 2746 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2747 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2748 data->pos_delta = I32_MAX - data->pos_min;
2749 if (is_par > U8_MAX)
2750 is_par = 0;
2751 if (is_par && pars==1 && data) {
2752 data->flags |= SF_IN_PAR;
2753 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2754 }
2755 else if (pars && data) {
c277df42
IZ
2756 data->flags |= SF_HAS_PAR;
2757 data->flags &= ~SF_IN_PAR;
2758 }
653099ff
GS
2759 if (flags & SCF_DO_STCLASS_OR)
2760 cl_and(data->start_class, &and_with);
c277df42
IZ
2761 return min;
2762}
2763
76e3520e 2764STATIC I32
bfed75c6 2765S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2766{
830247a4 2767 if (RExC_rx->data) {
b81d288d
AB
2768 Renewc(RExC_rx->data,
2769 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2770 char, struct reg_data);
830247a4
IZ
2771 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2772 RExC_rx->data->count += n;
a0ed51b3
LW
2773 }
2774 else {
830247a4 2775 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2776 char, struct reg_data);
830247a4
IZ
2777 New(1208, RExC_rx->data->what, n, U8);
2778 RExC_rx->data->count = n;
c277df42 2779 }
830247a4
IZ
2780 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2781 return RExC_rx->data->count - n;
c277df42
IZ
2782}
2783
d88dccdf 2784void
864dbfa3 2785Perl_reginitcolors(pTHX)
d88dccdf 2786{
1df70142 2787 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2788 if (s) {
1df70142
AL
2789 char *t = savepv(s);
2790 int i = 0;
2791 PL_colors[0] = t;
d88dccdf 2792 while (++i < 6) {
1df70142
AL
2793 t = strchr(t, '\t');
2794 if (t) {
2795 *t = '\0';
2796 PL_colors[i] = ++t;
d88dccdf
IZ
2797 }
2798 else
1df70142 2799 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2800 }
2801 } else {
1df70142 2802 int i = 0;
b81d288d 2803 while (i < 6)
06b5626a 2804 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2805 }
2806 PL_colorset = 1;
2807}
2808
8615cb43 2809
a687059c 2810/*
e50aee73 2811 - pregcomp - compile a regular expression into internal code
a687059c
LW
2812 *
2813 * We can't allocate space until we know how big the compiled form will be,
2814 * but we can't compile it (and thus know how big it is) until we've got a
2815 * place to put the code. So we cheat: we compile it twice, once with code
2816 * generation turned off and size counting turned on, and once "for real".
2817 * This also means that we don't allocate space until we are sure that the
2818 * thing really will compile successfully, and we never have to move the
2819 * code and thus invalidate pointers into it. (Note that it has to be in
2820 * one piece because free() must be able to free it all.) [NB: not true in perl]
2821 *
2822 * Beware that the optimization-preparation code in here knows about some
2823 * of the structure of the compiled regexp. [I'll say.]
2824 */
2825regexp *
864dbfa3 2826Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2827{
a0d0e21e 2828 register regexp *r;
c277df42 2829 regnode *scan;
c277df42 2830 regnode *first;
a0d0e21e 2831 I32 flags;
a0d0e21e
LW
2832 I32 minlen = 0;
2833 I32 sawplus = 0;
2834 I32 sawopen = 0;
2c2d71f5 2835 scan_data_t data;
830247a4
IZ
2836 RExC_state_t RExC_state;
2837 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2838
a3621e74
YO
2839 GET_RE_DEBUG_FLAGS_DECL;
2840
a0d0e21e 2841 if (exp == NULL)
c277df42 2842 FAIL("NULL regexp argument");
a0d0e21e 2843
a5961de5 2844 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2845
5cfc7842 2846 RExC_precomp = exp;
a3621e74
YO
2847 DEBUG_r(if (!PL_colorset) reginitcolors());
2848 DEBUG_COMPILE_r({
2849 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2850 PL_colors[4],PL_colors[5],PL_colors[0],
2851 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2852 });
e2509266 2853 RExC_flags = pm->op_pmflags;
830247a4 2854 RExC_sawback = 0;
bbce6d69 2855
830247a4
IZ
2856 RExC_seen = 0;
2857 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2858 RExC_seen_evals = 0;
2859 RExC_extralen = 0;
c277df42 2860
bbce6d69 2861 /* First pass: determine size, legality. */
830247a4 2862 RExC_parse = exp;
fac92740 2863 RExC_start = exp;
830247a4
IZ
2864 RExC_end = xend;
2865 RExC_naughty = 0;
2866 RExC_npar = 1;
2867 RExC_size = 0L;
2868 RExC_emit = &PL_regdummy;
2869 RExC_whilem_seen = 0;
85ddcde9
JH
2870#if 0 /* REGC() is (currently) a NOP at the first pass.
2871 * Clever compilers notice this and complain. --jhi */
830247a4 2872 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2873#endif
830247a4 2874 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 2875 RExC_precomp = Nullch;
a0d0e21e
LW
2876 return(NULL);
2877 }
a3621e74 2878 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2879
c277df42
IZ
2880 /* Small enough for pointer-storage convention?
2881 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2882 if (RExC_size >= 0x10000L && RExC_extralen)
2883 RExC_size += RExC_extralen;
c277df42 2884 else
830247a4
IZ
2885 RExC_extralen = 0;
2886 if (RExC_whilem_seen > 15)
2887 RExC_whilem_seen = 15;
a0d0e21e 2888
bbce6d69 2889 /* Allocate space and initialize. */
830247a4 2890 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2891 char, regexp);
a0d0e21e 2892 if (r == NULL)
b45f050a
JF
2893 FAIL("Regexp out of space");
2894
0f79a09d
GS
2895#ifdef DEBUGGING
2896 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2897 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2898#endif
c277df42 2899 r->refcnt = 1;
bbce6d69 2900 r->prelen = xend - exp;
5cfc7842 2901 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2902 r->subbeg = NULL;
f8c7b90f 2903#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2904 r->saved_copy = Nullsv;
2905#endif
cf93c79d 2906 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2907 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2908
2909 r->substrs = 0; /* Useful during FAIL. */
2910 r->startp = 0; /* Useful during FAIL. */
2911 r->endp = 0; /* Useful during FAIL. */
2912
fac92740
MJD
2913 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2914 if (r->offsets) {
2af232bd 2915 r->offsets[0] = RExC_size;
fac92740 2916 }
a3621e74 2917 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2918 "%s %"UVuf" bytes for offset annotations.\n",
2919 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2920 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2921
830247a4 2922 RExC_rx = r;
bbce6d69 2923
2924 /* Second pass: emit code. */
e2509266 2925 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2926 RExC_parse = exp;
2927 RExC_end = xend;
2928 RExC_naughty = 0;
2929 RExC_npar = 1;
fac92740 2930 RExC_emit_start = r->program;
830247a4 2931 RExC_emit = r->program;
2cd61cdb 2932 /* Store the count of eval-groups for security checks: */
eb160463 2933 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2934 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2935 r->data = 0;
830247a4 2936 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2937 return(NULL);
2938
a3621e74 2939
a0d0e21e 2940 /* Dig out information for optimizations. */
cf93c79d 2941 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2942 pm->op_pmflags = RExC_flags;
a0ed51b3 2943 if (UTF)
5ff6fc6d 2944 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2945 r->regstclass = NULL;
830247a4 2946 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2947 r->reganch |= ROPT_NAUGHTY;
c277df42 2948 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2949
2950 /* XXXX To minimize changes to RE engine we always allocate
2951 3-units-long substrs field. */
2952 Newz(1004, r->substrs, 1, struct reg_substr_data);
2953
2c2d71f5 2954 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2955 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2956 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2957 I32 fake;
c5254dd6 2958 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2959 struct regnode_charclass_class ch_class;
2960 int stclass_flag;
cb434fcc 2961 I32 last_close = 0;
a0d0e21e
LW
2962
2963 first = scan;
c277df42 2964 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2965 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2966 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2967 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2968 (OP(first) == PLUS) ||
2969 (OP(first) == MINMOD) ||
653099ff 2970 /* An {n,m} with n>0 */
22c35a8c 2971 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2972 if (OP(first) == PLUS)
2973 sawplus = 1;
2974 else
2975 first += regarglen[(U8)OP(first)];
2976 first = NEXTOPER(first);
a687059c
LW
2977 }
2978
a0d0e21e
LW
2979 /* Starting-point info. */
2980 again:
653099ff 2981 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2982 if (OP(first) == EXACT)
2983 ; /* Empty, get anchored substr later. */
2984 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2985 r->regstclass = first;
2986 }
bfed75c6 2987 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2988 r->regstclass = first;
22c35a8c
GS
2989 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2990 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2991 r->regstclass = first;
22c35a8c 2992 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2993 r->reganch |= (OP(first) == MBOL
2994 ? ROPT_ANCH_MBOL
2995 : (OP(first) == SBOL
2996 ? ROPT_ANCH_SBOL
2997 : ROPT_ANCH_BOL));
a0d0e21e 2998 first = NEXTOPER(first);
774d564b 2999 goto again;
3000 }
3001 else if (OP(first) == GPOS) {
3002 r->reganch |= ROPT_ANCH_GPOS;
3003 first = NEXTOPER(first);
3004 goto again;
a0d0e21e 3005 }
e09294f4 3006 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 3007 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3008 !(r->reganch & ROPT_ANCH) )
3009 {
3010 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3011 const int type =
3012 (OP(NEXTOPER(first)) == REG_ANY)
3013 ? ROPT_ANCH_MBOL
3014 : ROPT_ANCH_SBOL;
cad2e5aa 3015 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3016 first = NEXTOPER(first);
774d564b 3017 goto again;
a0d0e21e 3018 }
b81d288d 3019 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3020 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3021 /* x+ must match at the 1st pos of run of x's */
3022 r->reganch |= ROPT_SKIP;
a0d0e21e 3023
c277df42 3024 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3025 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3026 (IV)(first - scan + 1)));
a0d0e21e
LW
3027 /*
3028 * If there's something expensive in the r.e., find the
3029 * longest literal string that must appear and make it the
3030 * regmust. Resolve ties in favor of later strings, since
3031 * the regstart check works with the beginning of the r.e.
3032 * and avoiding duplication strengthens checking. Not a
3033 * strong reason, but sufficient in the absence of others.
3034 * [Now we resolve ties in favor of the earlier string if
c277df42 3035 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3036 * earlier string may buy us something the later one won't.]
3037 */
a0d0e21e 3038 minlen = 0;
a687059c 3039
79cb57f6
GS
3040 data.longest_fixed = newSVpvn("",0);
3041 data.longest_float = newSVpvn("",0);
3042 data.last_found = newSVpvn("",0);
c277df42
IZ
3043 data.longest = &(data.longest_fixed);
3044 first = scan;
653099ff 3045 if (!r->regstclass) {
830247a4 3046 cl_init(pRExC_state, &ch_class);
653099ff
GS
3047 data.start_class = &ch_class;
3048 stclass_flag = SCF_DO_STCLASS_AND;
3049 } else /* XXXX Check for BOUND? */
3050 stclass_flag = 0;
cb434fcc 3051 data.last_closep = &last_close;
653099ff 3052
830247a4 3053 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3054 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3055 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3056 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3057 && !RExC_seen_zerolen
3058 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3059 r->reganch |= ROPT_CHECK_ALL;
830247a4 3060 scan_commit(pRExC_state, &data);
c277df42
IZ
3061 SvREFCNT_dec(data.last_found);
3062
a0ed51b3 3063 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3064 if (longest_float_length
c277df42
IZ
3065 || (data.flags & SF_FL_BEFORE_EOL
3066 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3067 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3068 int t;
3069
a0ed51b3 3070 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3071 && data.offset_fixed == data.offset_float_min
3072 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3073 goto remove_float; /* As in (a)+. */
3074
33b8afdf
JH
3075 if (SvUTF8(data.longest_float)) {
3076 r->float_utf8 = data.longest_float;
3077 r->float_substr = Nullsv;
3078 } else {
3079 r->float_substr = data.longest_float;
3080 r->float_utf8 = Nullsv;
3081 }
c277df42
IZ
3082 r->float_min_offset = data.offset_float_min;
3083 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3084 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3085 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3086 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3087 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3088 }
3089 else {
aca2d497 3090 remove_float:
33b8afdf 3091 r->float_substr = r->float_utf8 = Nullsv;
c277df42 3092 SvREFCNT_dec(data.longest_float);
c5254dd6 3093 longest_float_length = 0;
a0d0e21e 3094 }
c277df42 3095
a0ed51b3 3096 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3097 if (longest_fixed_length
c277df42
IZ
3098 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3099 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3100 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3101 int t;
3102
33b8afdf
JH
3103 if (SvUTF8(data.longest_fixed)) {
3104 r->anchored_utf8 = data.longest_fixed;
3105 r->anchored_substr = Nullsv;
3106 } else {
3107 r->anchored_substr = data.longest_fixed;
3108 r->anchored_utf8 = Nullsv;
3109 }
c277df42 3110 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3111 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3112 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3113 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3114 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3115 }
3116 else {
33b8afdf 3117 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 3118 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3119 longest_fixed_length = 0;
a0d0e21e 3120 }
b81d288d 3121 if (r->regstclass
ffc61ed2 3122 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3123 r->regstclass = NULL;
33b8afdf
JH
3124 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3125 && stclass_flag
653099ff 3126 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3127 && !cl_is_anything(data.start_class))
3128 {
1df70142 3129 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3130
b81d288d 3131 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3132 struct regnode_charclass_class);
3133 StructCopy(data.start_class,
830247a4 3134 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3135 struct regnode_charclass_class);
830247a4 3136 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3137 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3138 PL_regdata = r->data; /* for regprop() */
a3621e74 3139 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3140 regprop(sv, (regnode*)data.start_class);
3141 PerlIO_printf(Perl_debug_log,
a0288114 3142 "synthetic stclass \"%s\".\n",
3f7c398e 3143 SvPVX_const(sv));});
653099ff 3144 }
c277df42
IZ
3145
3146 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3147 if (longest_fixed_length > longest_float_length) {
c277df42 3148 r->check_substr = r->anchored_substr;
33b8afdf 3149 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3150 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3151 if (r->reganch & ROPT_ANCH_SINGLE)
3152 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3153 }
3154 else {
c277df42 3155 r->check_substr = r->float_substr;
33b8afdf 3156 r->check_utf8 = r->float_utf8;
c277df42
IZ
3157 r->check_offset_min = data.offset_float_min;
3158 r->check_offset_max = data.offset_float_max;
a0d0e21e 3159 }
30382c73
IZ
3160 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3161 This should be changed ASAP! */
33b8afdf 3162 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3163 r->reganch |= RE_USE_INTUIT;
33b8afdf 3164 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3165 r->reganch |= RE_INTUIT_TAIL;
3166 }
a0ed51b3
LW
3167 }
3168 else {
c277df42
IZ
3169 /* Several toplevels. Best we can is to set minlen. */
3170 I32 fake;
653099ff 3171 struct regnode_charclass_class ch_class;
cb434fcc 3172 I32 last_close = 0;
c277df42 3173
a3621e74 3174 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3175 scan = r->program + 1;
830247a4 3176 cl_init(pRExC_state, &ch_class);
653099ff 3177 data.start_class = &ch_class;
cb434fcc 3178 data.last_closep = &last_close;
a3621e74 3179 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf
JH
3180 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3181 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 3182 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3183 && !cl_is_anything(data.start_class))
3184 {
1df70142 3185 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3186
b81d288d 3187 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3188 struct regnode_charclass_class);
3189 StructCopy(data.start_class,
830247a4 3190 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3191 struct regnode_charclass_class);
830247a4 3192 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3193 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3194 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3195 regprop(sv, (regnode*)data.start_class);
3196 PerlIO_printf(Perl_debug_log,
a0288114 3197 "synthetic stclass \"%s\".\n",
3f7c398e 3198 SvPVX_const(sv));});
653099ff 3199 }
a0d0e21e
LW
3200 }
3201
a0d0e21e 3202 r->minlen = minlen;
b81d288d 3203 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3204 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3205 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3206 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3207 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3208 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3209 if (RExC_seen & REG_SEEN_CANY)
3210 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
3211 Newz(1002, r->startp, RExC_npar, I32);
3212 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 3213 PL_regdata = r->data; /* for regprop() */
a3621e74 3214 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3215 return(r);
a687059c
LW
3216}
3217
3218/*
3219 - reg - regular expression, i.e. main body or parenthesized thing
3220 *
3221 * Caller must absorb opening parenthesis.
3222 *
3223 * Combining parenthesis handling with the base level of regular expression
3224 * is a trifle forced, but the need to tie the tails of the branches to what
3225 * follows makes it hard to avoid.
3226 */
76e3520e 3227STATIC regnode *
830247a4 3228S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3229 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3230{
27da23d5 3231 dVAR;
c277df42
IZ
3232 register regnode *ret; /* Will be the head of the group. */
3233 register regnode *br;
3234 register regnode *lastbr;
3235 register regnode *ender = 0;
a0d0e21e 3236 register I32 parno = 0;
e2509266 3237 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
3238
3239 /* for (?g), (?gc), and (?o) warnings; warning
3240 about (?c) will warn about (?g) -- japhy */
3241
3242 I32 wastedflags = 0x00,
3243 wasted_o = 0x01,
3244 wasted_g = 0x02,
3245 wasted_gc = 0x02 | 0x04,
3246 wasted_c = 0x04;
3247
fac92740 3248 char * parse_start = RExC_parse; /* MJD */
830247a4 3249 char *oregcomp_parse = RExC_parse;
c277df42 3250 char c;
a0d0e21e 3251
821b33a5 3252 *flagp = 0; /* Tentatively. */
a0d0e21e 3253
9d1d55b5 3254
a0d0e21e
LW
3255 /* Make an OPEN node, if parenthesized. */
3256 if (paren) {
fac92740 3257 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3258 U32 posflags = 0, negflags = 0;
3259 U32 *flagsp = &posflags;
0f5d15d6 3260 int logical = 0;
830247a4 3261 char *seqstart = RExC_parse;
ca9dfc88 3262
830247a4
IZ
3263 RExC_parse++;
3264 paren = *RExC_parse++;
c277df42 3265 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3266 switch (paren) {
fac92740 3267 case '<': /* (?<...) */
830247a4 3268 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3269 if (*RExC_parse == '!')
c277df42 3270 paren = ',';
b81d288d 3271 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3272 goto unknown;
830247a4 3273 RExC_parse++;
fac92740
MJD
3274 case '=': /* (?=...) */
3275 case '!': /* (?!...) */
830247a4 3276 RExC_seen_zerolen++;
fac92740
MJD
3277 case ':': /* (?:...) */
3278 case '>': /* (?>...) */
a0d0e21e 3279 break;
fac92740
MJD
3280 case '$': /* (?$...) */
3281 case '@': /* (?@...) */
8615cb43 3282 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3283 break;
fac92740 3284 case '#': /* (?#...) */
830247a4
IZ
3285 while (*RExC_parse && *RExC_parse != ')')
3286 RExC_parse++;
3287 if (*RExC_parse != ')')
c277df42 3288 FAIL("Sequence (?#... not terminated");
830247a4 3289 nextchar(pRExC_state);
a0d0e21e
LW
3290 *flagp = TRYAGAIN;
3291 return NULL;
fac92740 3292 case 'p': /* (?p...) */
9014280d 3293 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3294 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3295 /* FALL THROUGH*/
fac92740 3296 case '?': /* (??...) */
0f5d15d6 3297 logical = 1;
438a3801
YST
3298 if (*RExC_parse != '{')
3299 goto unknown;
830247a4 3300 paren = *RExC_parse++;
0f5d15d6 3301 /* FALL THROUGH */
fac92740 3302 case '{': /* (?{...}) */
c277df42 3303 {
c277df42
IZ
3304 I32 count = 1, n = 0;
3305 char c;
830247a4 3306 char *s = RExC_parse;
c277df42
IZ
3307 SV *sv;
3308 OP_4tree *sop, *rop;
3309
830247a4
IZ
3310 RExC_seen_zerolen++;
3311 RExC_seen |= REG_SEEN_EVAL;
3312 while (count && (c = *RExC_parse)) {
3313 if (c == '\\' && RExC_parse[1])
3314 RExC_parse++;
b81d288d 3315 else if (c == '{')
c277df42 3316 count++;
b81d288d 3317 else if (c == '}')
c277df42 3318 count--;
830247a4 3319 RExC_parse++;
c277df42 3320 }
830247a4 3321 if (*RExC_parse != ')')
b45f050a 3322 {
b81d288d 3323 RExC_parse = s;
b45f050a
JF
3324 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3325 }
c277df42 3326 if (!SIZE_ONLY) {
f3548bdc 3327 PAD *pad;
b81d288d
AB
3328
3329 if (RExC_parse - 1 - s)
830247a4 3330 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3331 else
79cb57f6 3332 sv = newSVpvn("", 0);
c277df42 3333
569233ed
SB
3334 ENTER;
3335 Perl_save_re_context(aTHX);
f3548bdc 3336 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3337 sop->op_private |= OPpREFCOUNTED;
3338 /* re_dup will OpREFCNT_inc */
3339 OpREFCNT_set(sop, 1);
569233ed 3340 LEAVE;
c277df42 3341
830247a4
IZ
3342 n = add_data(pRExC_state, 3, "nop");
3343 RExC_rx->data->data[n] = (void*)rop;
3344 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3345 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3346 SvREFCNT_dec(sv);
a0ed51b3 3347 }
e24b16f9 3348 else { /* First pass */
830247a4 3349 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3350 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3351 /* No compiled RE interpolated, has runtime
3352 components ===> unsafe. */
3353 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3354 if (PL_tainting && PL_tainted)
cc6b7395 3355 FAIL("Eval-group in insecure regular expression");
923e4eb5 3356 if (IN_PERL_COMPILETIME)
b5c19bd7 3357 PL_cv_has_eval = 1;
c277df42 3358 }
b5c19bd7 3359
830247a4 3360 nextchar(pRExC_state);
0f5d15d6 3361 if (logical) {
830247a4 3362 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3363 if (!SIZE_ONLY)
3364 ret->flags = 2;
830247a4 3365 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3366 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3367 return ret;
3368 }
ccb2c380
MP
3369 ret = reganode(pRExC_state, EVAL, n);
3370 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3371 Set_Node_Offset(ret, parse_start);
3372 return ret;
c277df42 3373 }
fac92740 3374 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3375 {
fac92740 3376 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3377 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3378 || RExC_parse[1] == '<'
830247a4 3379 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3380 I32 flag;
3381
830247a4 3382 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3383 if (!SIZE_ONLY)
3384 ret->flags = 1;
830247a4 3385 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3386 goto insert_if;
b81d288d 3387 }
a0ed51b3 3388 }
830247a4 3389 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3390 /* (?(1)...) */
830247a4 3391 parno = atoi(RExC_parse++);
c277df42 3392
830247a4
IZ
3393 while (isDIGIT(*RExC_parse))
3394 RExC_parse++;
fac92740 3395 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3396
830247a4 3397 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3398 vFAIL("Switch condition not recognized");
c277df42 3399 insert_if:
830247a4
IZ
3400 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3401 br = regbranch(pRExC_state, &flags, 1);
c277df42 3402 if (br == NULL)
830247a4 3403 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3404 else
830247a4
IZ
3405 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3406 c = *nextchar(pRExC_state);
d1b80229
IZ
3407 if (flags&HASWIDTH)
3408 *flagp |= HASWIDTH;
c277df42 3409 if (c == '|') {
830247a4
IZ
3410 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3411 regbranch(pRExC_state, &flags, 1);
3412 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3413 if (flags&HASWIDTH)
3414 *flagp |= HASWIDTH;
830247a4 3415 c = *nextchar(pRExC_state);
a0ed51b3
LW
3416 }
3417 else
c277df42
IZ
3418 lastbr = NULL;
3419 if (c != ')')
8615cb43 3420 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3421 ender = reg_node(pRExC_state, TAIL);
3422 regtail(pRExC_state, br, ender);
c277df42 3423 if (lastbr) {
830247a4
IZ
3424 regtail(pRExC_state, lastbr, ender);
3425 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3426 }
3427 else
830247a4 3428 regtail(pRExC_state, ret, ender);
c277df42 3429 return ret;
a0ed51b3
LW
3430 }
3431 else {
830247a4 3432 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3433 }
3434 }
1b1626e4 3435 case 0:
830247a4 3436 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3437 vFAIL("Sequence (? incomplete");
1b1626e4 3438 break;
a0d0e21e 3439 default:
830247a4 3440 --RExC_parse;
fac92740 3441 parse_flags: /* (?i) */
830247a4 3442 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3443 /* (?g), (?gc) and (?o) are useless here
3444 and must be globally applied -- japhy */
3445
3446 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3447 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3448 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3449 if (! (wastedflags & wflagbit) ) {
3450 wastedflags |= wflagbit;
3451 vWARN5(
3452 RExC_parse + 1,
3453 "Useless (%s%c) - %suse /%c modifier",
3454 flagsp == &negflags ? "?-" : "?",
3455 *RExC_parse,
3456 flagsp == &negflags ? "don't " : "",
3457 *RExC_parse
3458 );
3459 }
3460 }
3461 }
3462 else if (*RExC_parse == 'c') {
3463 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3464 if (! (wastedflags & wasted_c) ) {
3465 wastedflags |= wasted_gc;
3466 vWARN3(
3467 RExC_parse + 1,
3468 "Useless (%sc) - %suse /gc modifier",
3469 flagsp == &negflags ? "?-" : "?",
3470 flagsp == &negflags ? "don't " : ""
3471 );
3472 }
3473 }
3474 }
3475 else { pmflag(flagsp, *RExC_parse); }
3476
830247a4 3477 ++RExC_parse;
ca9dfc88 3478 }
830247a4 3479 if (*RExC_parse == '-') {
ca9dfc88 3480 flagsp = &negflags;
9d1d55b5 3481 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3482 ++RExC_parse;
ca9dfc88 3483 goto parse_flags;
48c036b1 3484 }
e2509266
JH
3485 RExC_flags |= posflags;
3486 RExC_flags &= ~negflags;
830247a4
IZ
3487 if (*RExC_parse == ':') {
3488 RExC_parse++;
ca9dfc88
IZ
3489 paren = ':';
3490 break;
3491 }
c277df42 3492 unknown:
830247a4
IZ
3493 if (*RExC_parse != ')') {
3494 RExC_parse++;
3495 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3496 }
830247a4 3497 nextchar(pRExC_state);
a0d0e21e
LW
3498 *flagp = TRYAGAIN;
3499 return NULL;
3500 }
3501 }
fac92740 3502 else { /* (...) */
830247a4
IZ
3503 parno = RExC_npar;
3504 RExC_npar++;
3505 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3506 Set_Node_Length(ret, 1); /* MJD */
3507 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3508 open = 1;
a0d0e21e 3509 }
a0ed51b3 3510 }
fac92740 3511 else /* ! paren */
a0d0e21e
LW
3512 ret = NULL;
3513
3514 /* Pick up the branches, linking them together. */
fac92740 3515 parse_start = RExC_parse; /* MJD */
830247a4 3516 br = regbranch(pRExC_state, &flags, 1);
fac92740 3517 /* branch_len = (paren != 0); */
2af232bd 3518
a0d0e21e
LW
3519 if (br == NULL)
3520 return(NULL);
830247a4
IZ
3521 if (*RExC_parse == '|') {
3522 if (!SIZE_ONLY && RExC_extralen) {
3523 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3524 }
fac92740 3525 else { /* MJD */
830247a4 3526 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3527 Set_Node_Length(br, paren != 0);
3528 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3529 }
c277df42
IZ
3530 have_branch = 1;
3531 if (SIZE_ONLY)
830247a4 3532 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3533 }
3534 else if (paren == ':') {
c277df42
IZ
3535 *flagp |= flags&SIMPLE;
3536 }
3537 if (open) { /* Starts with OPEN. */
830247a4 3538 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3539 }
3540 else if (paren != '?') /* Not Conditional */
a0d0e21e 3541 ret = br;
32a0ca98 3542 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3543 lastbr = br;
830247a4
IZ
3544 while (*RExC_parse == '|') {
3545 if (!SIZE_ONLY && RExC_extralen) {
3546 ender = reganode(pRExC_state, LONGJMP,0);
3547 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3548 }
3549 if (SIZE_ONLY)
830247a4
IZ
3550 RExC_extralen += 2; /* Account for LONGJMP. */
3551 nextchar(pRExC_state);
3552 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3553
a687059c 3554 if (br == NULL)
a0d0e21e 3555 return(NULL);
830247a4 3556 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3557 lastbr = br;
821b33a5
IZ
3558 if (flags&HASWIDTH)
3559 *flagp |= HASWIDTH;
a687059c 3560 *flagp |= flags&SPSTART;
a0d0e21e
LW
3561 }
3562
c277df42
IZ
3563 if (have_branch || paren != ':') {
3564 /* Make a closing node, and hook it on the end. */
3565 switch (paren) {
3566 case ':':
830247a4 3567 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3568 break;
3569 case 1:
830247a4 3570 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3571 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3572 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3573 break;
3574 case '<':
c277df42
IZ
3575 case ',':
3576 case '=':
3577 case '!':
c277df42 3578 *flagp &= ~HASWIDTH;
821b33a5
IZ
3579 /* FALL THROUGH */
3580 case '>':
830247a4 3581 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3582 break;
3583 case 0:
830247a4 3584 ender = reg_node(pRExC_state, END);
c277df42
IZ
3585 break;
3586 }
830247a4 3587 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3588
c277df42
IZ
3589 if (have_branch) {
3590 /* Hook the tails of the branches to the closing node. */
3591 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3592 regoptail(pRExC_state, br, ender);
c277df42
IZ
3593 }
3594 }
a0d0e21e 3595 }
c277df42
IZ
3596
3597 {
e1ec3a88
AL
3598 const char *p;
3599 static const char parens[] = "=!<,>";
c277df42
IZ
3600
3601 if (paren && (p = strchr(parens, paren))) {
eb160463 3602 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3603 int flag = (p - parens) > 1;
3604
3605 if (paren == '>')
3606 node = SUSPEND, flag = 0;
830247a4 3607 reginsert(pRExC_state, node,ret);
45948336
EP
3608 Set_Node_Cur_Length(ret);
3609 Set_Node_Offset(ret, parse_start + 1);
c277df42 3610 ret->flags = flag;
830247a4 3611 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3612 }
a0d0e21e
LW
3613 }
3614
3615 /* Check for proper termination. */
ce3e6498 3616 if (paren) {
e2509266 3617 RExC_flags = oregflags;
830247a4
IZ
3618 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3619 RExC_parse = oregcomp_parse;
380a0633 3620 vFAIL("Unmatched (");
ce3e6498 3621 }
a0ed51b3 3622 }
830247a4
IZ
3623 else if (!paren && RExC_parse < RExC_end) {
3624 if (*RExC_parse == ')') {
3625 RExC_parse++;
380a0633 3626 vFAIL("Unmatched )");
a0ed51b3
LW
3627 }
3628 else
b45f050a 3629 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3630 /* NOTREACHED */
3631 }
a687059c 3632
a0d0e21e 3633 return(ret);
a687059c
LW
3634}
3635
3636/*
3637 - regbranch - one alternative of an | operator
3638 *
3639 * Implements the concatenation operator.
3640 */
76e3520e 3641STATIC regnode *
830247a4 3642S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3643{
c277df42
IZ
3644 register regnode *ret;
3645 register regnode *chain = NULL;
3646 register regnode *latest;
3647 I32 flags = 0, c = 0;
a0d0e21e 3648
b81d288d 3649 if (first)
c277df42
IZ
3650 ret = NULL;
3651 else {
b81d288d 3652 if (!SIZE_ONLY && RExC_extralen)
830247a4 3653 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3654 else {
830247a4 3655 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3656 Set_Node_Length(ret, 1);
3657 }
c277df42
IZ
3658 }
3659
b81d288d 3660 if (!first && SIZE_ONLY)
830247a4 3661 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3662