This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
simple optimization for SelectSaver
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 *
80 **** Alterations to Henry's code are...
81 ****
4bb101f2 82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 84 ****
9ef589d8
LW
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
87
a687059c
LW
88 *
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
92 */
93#include "EXTERN.h"
864dbfa3 94#define PERL_IN_REGCOMP_C
a687059c 95#include "perl.h"
d06ea78c 96
acfe0abc 97#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
98# include "INTERN.h"
99#endif
c277df42
IZ
100
101#define REG_COMP_C
a687059c
LW
102#include "regcomp.h"
103
d4cce5f1 104#ifdef op
11343788 105#undef op
d4cce5f1 106#endif /* op */
11343788 107
fe14fcc3 108#ifdef MSDOS
7e4e8c89 109# if defined(BUGGY_MSC6)
fe14fcc3 110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 111# pragma optimize("a",off)
fe14fcc3 112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
113# pragma optimize("w",on )
114# endif /* BUGGY_MSC6 */
fe14fcc3
LW
115#endif /* MSDOS */
116
a687059c
LW
117#ifndef STATIC
118#define STATIC static
119#endif
120
830247a4 121typedef struct RExC_state_t {
e2509266 122 U32 flags; /* are we folding, multilining? */
830247a4
IZ
123 char *precomp; /* uncompiled string. */
124 regexp *rx;
fac92740 125 char *start; /* Start of input for compile */
830247a4
IZ
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 129 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 130 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
133 U32 seen;
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
136 I32 extralen;
137 I32 seen_zerolen;
138 I32 seen_evals;
1aa99e6b 139 I32 utf8;
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
fac92740 149#define RExC_start (pRExC_state->start)
830247a4
IZ
150#define RExC_end (pRExC_state->end)
151#define RExC_parse (pRExC_state->parse)
152#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 153#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 154#define RExC_emit (pRExC_state->emit)
fac92740 155#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
156#define RExC_naughty (pRExC_state->naughty)
157#define RExC_sawback (pRExC_state->sawback)
158#define RExC_seen (pRExC_state->seen)
159#define RExC_size (pRExC_state->size)
160#define RExC_npar (pRExC_state->npar)
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
830247a4 165
a687059c
LW
166#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
a687059c 169
35c8bce7
LW
170#ifdef SPSTART
171#undef SPSTART /* dratted cpp namespace... */
172#endif
a687059c
LW
173/*
174 * Flags to be passed up and down.
175 */
a687059c 176#define WORST 0 /* Worst case. */
821b33a5 177#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
178#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179#define SPSTART 0x4 /* Starts with * or +. */
180#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 181
2c2d71f5
JH
182/* Length of a variant. */
183
184typedef struct scan_data_t {
185 I32 len_min;
186 I32 len_delta;
187 I32 pos_min;
188 I32 pos_delta;
189 SV *last_found;
190 I32 last_end; /* min value, <0 unless valid. */
191 I32 last_start_min;
192 I32 last_start_max;
193 SV **longest; /* Either &l_fixed, or &l_float. */
194 SV *longest_fixed;
195 I32 offset_fixed;
196 SV *longest_float;
197 I32 offset_float_min;
198 I32 offset_float_max;
199 I32 flags;
200 I32 whilem_c;
cb434fcc 201 I32 *last_closep;
653099ff 202 struct regnode_charclass_class *start_class;
2c2d71f5
JH
203} scan_data_t;
204
a687059c 205/*
e50aee73 206 * Forward declarations for pregcomp()'s friends.
a687059c 207 */
a0d0e21e 208
b81d288d 209static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 210 0, 0, 0, 0, 0, 0};
c277df42
IZ
211
212#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213#define SF_BEFORE_SEOL 0x1
214#define SF_BEFORE_MEOL 0x2
215#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
09b7f37c
CB
218#ifdef NO_UNARY_PLUS
219# define SF_FIX_SHIFT_EOL (0+2)
220# define SF_FL_SHIFT_EOL (0+4)
221#else
222# define SF_FIX_SHIFT_EOL (+2)
223# define SF_FL_SHIFT_EOL (+4)
224#endif
c277df42
IZ
225
226#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231#define SF_IS_INF 0x40
232#define SF_HAS_PAR 0x80
233#define SF_IN_PAR 0x100
234#define SF_HAS_EVAL 0x200
4bfe0158 235#define SCF_DO_SUBSTR 0x400
653099ff
GS
236#define SCF_DO_STCLASS_AND 0x0800
237#define SCF_DO_STCLASS_OR 0x1000
238#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 239#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 240
eb160463 241#define UTF (RExC_utf8 != 0)
e2509266
JH
242#define LOC ((RExC_flags & PMf_LOCALE) != 0)
243#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 244
ffc61ed2 245#define OOB_UNICODE 12345678
93733859 246#define OOB_NAMEDCLASS -1
b8c5462f 247
a0ed51b3
LW
248#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
8615cb43 251
b45f050a
JF
252/* length of regex to show in messages that don't mark a position within */
253#define RegexLengthToShowInErrorMessages 127
254
255/*
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
259 */
7253e4e3
RK
260#define MARKER1 "<-- HERE" /* marker as it appears in the description */
261#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 262
7253e4e3 263#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
264
265/*
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 * "...".
269 */
ccb2c380 270#define FAIL(msg) STMT_START { \
bfed75c6 271 const char *ellipses = ""; \
ccb2c380
MP
272 IV len = RExC_end - RExC_precomp; \
273 \
274 if (!SIZE_ONLY) \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
279 ellipses = "..."; \
280 } \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
283} STMT_END
8615cb43 284
b45f050a
JF
285/*
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
288 * "...".
289 */
ccb2c380 290#define FAIL2(pat,msg) STMT_START { \
bfed75c6 291 const char *ellipses = ""; \
ccb2c380
MP
292 IV len = RExC_end - RExC_precomp; \
293 \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
299 ellipses = "..."; \
300 } \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
303} STMT_END
b45f050a
JF
304
305
306/*
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 */
ccb2c380
MP
309#define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
313} STMT_END
b45f050a
JF
314
315/*
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 */
ccb2c380
MP
318#define vFAIL(m) STMT_START { \
319 if (!SIZE_ONLY) \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 Simple_vFAIL(m); \
322} STMT_END
b45f050a
JF
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
ccb2c380
MP
327#define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
331} STMT_END
b45f050a
JF
332
333/*
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 */
ccb2c380
MP
336#define vFAIL2(m,a1) STMT_START { \
337 if (!SIZE_ONLY) \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
340} STMT_END
b45f050a
JF
341
342
343/*
344 * Like Simple_vFAIL(), but accepts three arguments.
345 */
ccb2c380
MP
346#define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
350} STMT_END
b45f050a
JF
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
ccb2c380
MP
355#define vFAIL3(m,a1,a2) STMT_START { \
356 if (!SIZE_ONLY) \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
359} STMT_END
b45f050a
JF
360
361/*
362 * Like Simple_vFAIL(), but accepts four arguments.
363 */
ccb2c380
MP
364#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
b45f050a
JF
369
370/*
371 * Like Simple_vFAIL(), but accepts five arguments.
372 */
ccb2c380
MP
373#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379
380#define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
385
386#define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
392
393
394#define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
398} STMT_END
399
400#define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
405
406#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
411
412#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416} STMT_END
9d1d55b5 417
8615cb43 418
cd439c50 419/* Allow for side effects in s */
ccb2c380
MP
420#define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422} STMT_END
cd439c50 423
fac92740
MJD
424/* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
428 */
429
430#define MJD_OFFSET_DEBUG(x)
a3621e74 431/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
432
433
434#define Set_Node_Offset_To_R(node,byte) STMT_START { \
435 if (! SIZE_ONLY) { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444} STMT_END
445
446#define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450#define Set_Node_Length_To_R(node,len) STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
454 if((node) < 0) { \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456 } else { \
457 RExC_offsets[2*(node)] = (len); \
458 } \
459 } \
460} STMT_END
461
462#define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465#define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
467
468/* Get offsets and lengths */
469#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
acfe0abc 472static void clear_re(pTHX_ void *r);
4327152a 473
653099ff
GS
474/* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
477
4327152a 478STATIC void
830247a4 479S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 480{
a0ed51b3
LW
481 STRLEN l = CHR_SVLEN(data->last_found);
482 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 483
c277df42 484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 485 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
b81d288d 489 data->flags
c277df42
IZ
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491 else
492 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
493 }
494 else {
c277df42 495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
496 data->offset_float_max = (l
497 ? data->last_start_max
c277df42 498 : data->pos_min + data->pos_delta);
9051bda5
HS
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
c277df42 501 if (data->flags & SF_BEFORE_EOL)
b81d288d 502 data->flags
c277df42
IZ
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504 else
505 data->flags &= ~SF_FL_BEFORE_EOL;
506 }
507 }
508 SvCUR_set(data->last_found, 0);
0eda9292
JH
509 {
510 SV * sv = data->last_found;
511 MAGIC *mg =
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
514 mg->mg_len = 0;
515 }
c277df42
IZ
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b 588
14ebb1a2
JH
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
14ebb1a2
JH
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 597 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 600 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
601}
602
603/* 'OR' a given class with another one. Can create false positives */
604/* We assume that cl is not inverted */
605STATIC void
830247a4 606S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 607{
653099ff
GS
608 if (or_with->flags & ANYOF_INVERT) {
609 /* We do not use
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617 */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
621 int i;
622
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
626 else {
830247a4 627 cl_anything(pRExC_state, cl);
653099ff
GS
628 }
629 } else {
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 632 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
633 || (cl->flags & ANYOF_FOLD)) ) {
634 int i;
635
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
643 }
644 }
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647 }
648 }
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
651
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
656 }
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
660 }
653099ff
GS
661}
662
5d1c421c 663/*
a3621e74
YO
664
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
673
674Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
675
676A trie is an N'ary tree where the branches are determined by digital
677decomposition of the key. IE, at the root node you look up the 1st character and
678follow that branch repeat until you find the end of the branches. Nodes can be
679marked as "accepting" meaning they represent a complete word. Eg:
680
681 /he|she|his|hers/
682
683would convert into the following structure. Numbers represent states, letters
684following numbers represent valid transitions on the letter from that state, if
685the number is in square brackets it represents an accepting state, otherwise it
686will be in parenthesis.
687
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
689 | |
690 | (2)
691 | |
692 (1) +-i->(6)-+-s->[7]
693 |
694 +-s->(3)-+-h->(4)-+-e->[5]
695
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
697
698This shows that when matching against the string 'hers' we will begin at state 1
699read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702single traverse. We store a mapping from accepting to state to which word was
703matched, and then when we have multiple possibilities we try to complete the
704rest of the regex in the order in which they occured in the alternation.
705
706The only prior NFA like behaviour that would be changed by the TRIE support is
707the silent ignoring of duplicate alternations which are of the form:
708
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
710
711Thus EVAL blocks follwing a trie may be called a different number of times with
712and without the optimisation. With the optimisations dupes will be silently
713ignored. This inconsistant behaviour of EVAL type nodes is well established as
714the following demonstrates:
715
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
717
718which prints out 'word' three times, but
719
720 'words'=~/(word|word|word)(?{ print $1 })S/
721
722which doesnt print it out at all. This is due to other optimisations kicking in.
723
724Example of what happens on a structural level:
725
726The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
727
728 1: CURLYM[1] {1,32767}(18)
729 5: BRANCH(8)
730 6: EXACT <ac>(16)
731 8: BRANCH(11)
732 9: EXACT <ad>(16)
733 11: BRANCH(14)
734 12: EXACT <ab>(16)
735 16: SUCCEED(0)
736 17: NOTHING(18)
737 18: END(0)
738
739This would be optimizable with startbranch=5, first=5, last=16, tail=16
740and should turn into:
741
742 1: CURLYM[1] {1,32767}(18)
743 5: TRIE(16)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
745 <ac>
746 <ad>
747 <ab>
748 16: SUCCEED(0)
749 17: NOTHING(18)
750 18: END(0)
751
752Cases where tail != last would be like /(?foo|bar)baz/:
753
754 1: BRANCH(4)
755 2: EXACT <foo>(8)
756 4: BRANCH(7)
757 5: EXACT <bar>(8)
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762which would be optimizable with startbranch=1, first=1, last=7, tail=8
763and would end up looking like:
764
765 1: TRIE(8)
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
767 <foo>
768 <bar>
769 7: TAIL(8)
770 8: EXACT <baz>(10)
771 10: END(0)
772
773*/
774
775#define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
777 SV *tmp; \
778 if ( UTF ) { \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
781 } else { \
782 tmp = Perl_newSVpvf_nocontext( "%c", uvc ); \
783 } \
784 av_push( trie->revcharmap, tmp ); \
785 })
786
787#define TRIE_READ_CHAR STMT_START { \
788 if ( UTF ) { \
789 if ( folder ) { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
792 foldlen -= len; \
793 scan += len; \
794 len = 0; \
795 } else { \
796 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags); \
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
800 } \
801 } else { \
802 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags); \
803 } \
804 } else { \
805 uvc = (U32)*uc; \
806 len = 1; \
807 } \
808} STMT_END
809
810
811#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
815
816#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
821 } \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
825} STMT_END
826
827#define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
832} STMT_END
833
834STATIC I32
835S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
836{
837 /* first pass, loop through and scan words */
838 reg_trie_data *trie;
839 regnode *cur;
840 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
841 STRLEN len = 0;
842 UV uvc = 0;
843 U16 curword = 0;
844 U32 next_alloc = 0;
845 /* we just use folder as a flag in utf8 */
846 const U8 *folder=( flags == EXACTF
847 ? PL_fold
848 : ( flags == EXACTFL
849 ? PL_fold_locale
850 : NULL
851 )
852 );
853
854 U32 data_slot = add_data( pRExC_state, 1, "t" );
855 SV *re_trie_maxbuff;
856
857 GET_RE_DEBUG_FLAGS_DECL;
858
859 Newz( 848200, trie, 1, reg_trie_data );
860 trie->refcount = 1;
861 RExC_rx->data->data[ data_slot ] = (void*)trie;
862 Newz( 848201, trie->charmap, 256, U16 );
863 DEBUG_r({
864 trie->words = newAV();
865 trie->revcharmap = newAV();
866 });
867
868
869 re_trie_maxbuff=get_sv(RE_TRIE_MAXBUFF, 1);
870 if (!SvIOK(re_trie_maxbuff)) {
871 sv_setiv(re_trie_maxbuff, TRIE_SIMPLE_MAX_BUFF);
872 }
873
874 /* -- First loop and Setup --
875
876 We first traverse the branches and scan each word to determine if it
877 contains widechars, and how many unique chars there are, this is
878 important as we have to build a table with at least as many columns as we
879 have unique chars.
880
881 We use an array of integers to represent the character codes 0..255
882 (trie->charmap) and we use a an HV* to store unicode characters. We use the
883 native representation of the character value as the key and IV's for the
884 coded index.
885
886 *TODO* If we keep track of how many times each character is used we can
887 remap the columns so that the table compression later on is more
888 efficient in terms of memory by ensuring most common value is in the
889 middle and the least common are on the outside. IMO this would be better
890 than a most to least common mapping as theres a decent chance the most
891 common letter will share a node with the least common, meaning the node
892 will not be compressable. With a middle is most common approach the worst
893 case is when we have the least common nodes twice.
894
895 */
896
897
898 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
899 regnode *noper = NEXTOPER( cur );
900 U8 *uc = (U8*)STRING( noper );
901 U8 *e = uc + STR_LEN( noper );
902 STRLEN foldlen = 0;
903 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
904 U8 *scan;
905
906 for ( ; uc < e ; uc += len ) {
907 trie->charcount++;
908 TRIE_READ_CHAR;
909 if ( uvc < 256 ) {
910 if ( !trie->charmap[ uvc ] ) {
911 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
912 if ( folder )
913 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
914 TRIE_DEBUG_CHAR;
915 }
916 } else {
917 SV** svpp;
918 if ( !trie->widecharmap )
919 trie->widecharmap = newHV();
920
921 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
922
923 if ( !svpp )
924 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%X", uvc );
925
926 if ( !SvTRUE( *svpp ) ) {
927 sv_setiv( *svpp, ++trie->uniquecharcount );
928 TRIE_DEBUG_CHAR;
929 }
930 }
931 }
932 trie->wordcount++;
933 } /* end first pass */
934 DEBUG_TRIE_COMPILE_r(
935 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
936 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
937 trie->charcount, trie->uniquecharcount )
938 );
939
940
941 /*
942 We now know what we are dealing with in terms of unique chars and
943 string sizes so we can calculate how much memory a naive
944 representation using a flat table will take. If its over a reasonable
945 limit (as specified by $^RE_TRIE_MAXBUFF) we use a more memory
946 conservative but potentially much slower representation using an array
947 of lists.
948
949 At the end we convert both representations into the same compressed
950 form that will be used in regexec.c for matching with. The latter
951 is a form that cannot be used to construct with but has memory
952 properties similar to the list form and access properties similar
953 to the table form making it both suitable for fast searches and
954 small enough that its feasable to store for the duration of a program.
955
956 See the comment in the code where the compressed table is produced
957 inplace from the flat tabe representation for an explanation of how
958 the compression works.
959
960 */
961
962
963 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
964 /*
965 Second Pass -- Array Of Lists Representation
966
967 Each state will be represented by a list of charid:state records
968 (reg_trie_trans_le) the first such element holds the CUR and LEN
969 points of the allocated array. (See defines above).
970
971 We build the initial structure using the lists, and then convert
972 it into the compressed table form which allows faster lookups
973 (but cant be modified once converted).
974
975
976 */
977
978
979 STRLEN transcount = 1;
980
981 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
982 TRIE_LIST_NEW(1);
983 next_alloc = 2;
984
985 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
986
987 regnode *noper = NEXTOPER( cur );
988 U8 *uc = (U8*)STRING( noper );
989 U8 *e = uc + STR_LEN( noper );
990 U32 state = 1; /* required init */
991 U16 charid = 0; /* sanity init */
992 U8 *scan = (U8*)NULL; /* sanity init */
993 STRLEN foldlen = 0; /* required init */
994 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
995
996
997 for ( ; uc < e ; uc += len ) {
998
999 TRIE_READ_CHAR;
1000
1001 if ( uvc < 256 ) {
1002 charid = trie->charmap[ uvc ];
1003 } else {
1004 SV** svpp=(SV**)NULL;
1005 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1006 if ( !svpp ) {
1007 charid = 0;
1008 } else {
1009 charid=(U16)SvIV( *svpp );
1010 }
1011 }
1012 if ( charid ) {
1013
1014 U16 check;
1015 U32 newstate = 0;
1016
1017 charid--;
1018 if ( !trie->states[ state ].trans.list ) {
1019 TRIE_LIST_NEW( state );
1020 }
1021 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1022 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1023 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1024 break;
1025 }
1026 }
1027 if ( ! newstate ) {
1028 newstate = next_alloc++;
1029 TRIE_LIST_PUSH( state, charid, newstate );
1030 transcount++;
1031 }
1032 state = newstate;
1033
1034 } else {
1035 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1036 }
1037 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1038 }
1039
1040 if ( !trie->states[ state ].wordnum ) {
1041 /* we havent inserted this word into the structure yet. */
1042 trie->states[ state ].wordnum = ++curword;
1043
1044 DEBUG_r({
1045 /* store the word for dumping */
1046 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1047 if ( UTF ) SvUTF8_on( tmp );
1048 av_push( trie->words, tmp );
1049 });
1050
1051 } else {
1052 /* Its a dupe. So ignore it. */
1053 }
1054
1055 } /* end second pass */
1056
1057 trie->laststate = next_alloc;
1058 Renew( trie->states, next_alloc, reg_trie_state );
1059
1060 DEBUG_TRIE_COMPILE_MORE_r({
1061 U32 state;
1062 U16 charid;
1063
1064 /*
1065 print out the table precompression.
1066 */
1067
1068 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1069 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1070
1071 for( state=1 ; state < next_alloc ; state ++ ) {
1072
1073 PerlIO_printf( Perl_debug_log, "\n %04X :", state );
1074 if ( ! trie->states[ state ].wordnum ) {
1075 PerlIO_printf( Perl_debug_log, "%5s| ","");
1076 } else {
1077 PerlIO_printf( Perl_debug_log, "W%04X| ",
1078 trie->states[ state ].wordnum
1079 );
1080 }
1081 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1082 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1083 PerlIO_printf( Perl_debug_log, "%s:%3X=%04X | ",
1084 SvPV_nolen( *tmp ),
1085 TRIE_LIST_ITEM(state,charid).forid,
1086 TRIE_LIST_ITEM(state,charid).newstate
1087 );
1088 }
1089
1090 }
1091 PerlIO_printf( Perl_debug_log, "\n\n" );
1092 });
1093
1094 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1095 {
1096 U32 state;
1097 U16 idx;
1098 U32 tp = 0;
1099 U32 zp = 0;
1100
1101
1102 for( state=1 ; state < next_alloc ; state ++ ) {
1103 U32 base=0;
1104
1105 /*
1106 DEBUG_TRIE_COMPILE_MORE_r(
1107 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1108 );
1109 */
1110
1111 if (trie->states[state].trans.list) {
1112 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1113 U16 maxid=minid;
1114
1115
1116 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1117 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1118 minid=TRIE_LIST_ITEM( state, idx).forid;
1119 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1120 maxid=TRIE_LIST_ITEM( state, idx).forid;
1121 }
1122 }
1123 if ( transcount < tp + maxid - minid + 1) {
1124 transcount *= 2;
1125 Renew( trie->trans, transcount, reg_trie_trans );
1126 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1127 }
1128 base = trie->uniquecharcount + tp - minid;
1129 if ( maxid == minid ) {
1130 U32 set = 0;
1131 for ( ; zp < tp ; zp++ ) {
1132 if ( ! trie->trans[ zp ].next ) {
1133 base = trie->uniquecharcount + zp - minid;
1134 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1135 trie->trans[ zp ].check = state;
1136 set = 1;
1137 break;
1138 }
1139 }
1140 if ( !set ) {
1141 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1142 trie->trans[ tp ].check = state;
1143 tp++;
1144 zp = tp;
1145 }
1146 } else {
1147 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1148 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1149 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1150 trie->trans[ tid ].check = state;
1151 }
1152 tp += ( maxid - minid + 1 );
1153 }
1154 Safefree(trie->states[ state ].trans.list);
1155 }
1156 /*
1157 DEBUG_TRIE_COMPILE_MORE_r(
1158 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1159 );
1160 */
1161 trie->states[ state ].trans.base=base;
1162 }
cc601c31 1163 trie->lasttrans = tp + 1;
a3621e74
YO
1164 }
1165 } else {
1166 /*
1167 Second Pass -- Flat Table Representation.
1168
1169 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1170 We know that we will need Charcount+1 trans at most to store the data
1171 (one row per char at worst case) So we preallocate both structures
1172 assuming worst case.
1173
1174 We then construct the trie using only the .next slots of the entry
1175 structs.
1176
1177 We use the .check field of the first entry of the node temporarily to
1178 make compression both faster and easier by keeping track of how many non
1179 zero fields are in the node.
1180
1181 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1182 transition.
1183
1184 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1185 number representing the first entry of the node, and state as a
1186 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1187 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1188 are 2 entrys per node. eg:
1189
1190 A B A B
1191 1. 2 4 1. 3 7
1192 2. 0 3 3. 0 5
1193 3. 0 0 5. 0 0
1194 4. 0 0 7. 0 0
1195
1196 The table is internally in the right hand, idx form. However as we also
1197 have to deal with the states array which is indexed by nodenum we have to
1198 use TRIE_NODENUM() to convert.
1199
1200 */
1201
1202 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1203 reg_trie_trans );
1204 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1205 next_alloc = trie->uniquecharcount + 1;
1206
1207 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1208
1209 regnode *noper = NEXTOPER( cur );
1210 U8 *uc = (U8*)STRING( noper );
1211 U8 *e = uc + STR_LEN( noper );
1212
1213 U32 state = 1; /* required init */
1214
1215 U16 charid = 0; /* sanity init */
1216 U32 accept_state = 0; /* sanity init */
1217 U8 *scan = (U8*)NULL; /* sanity init */
1218
1219 STRLEN foldlen = 0; /* required init */
1220 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1221
1222
1223 for ( ; uc < e ; uc += len ) {
1224
1225 TRIE_READ_CHAR;
1226
1227 if ( uvc < 256 ) {
1228 charid = trie->charmap[ uvc ];
1229 } else {
1230 SV** svpp=(SV**)NULL;
1231 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1232 if ( !svpp ) {
1233 charid = 0;
1234 } else {
1235 charid=(U16)SvIV( *svpp );
1236 }
1237 }
1238 if ( charid ) {
1239 charid--;
1240 if ( !trie->trans[ state + charid ].next ) {
1241 trie->trans[ state + charid ].next = next_alloc;
1242 trie->trans[ state ].check++;
1243 next_alloc += trie->uniquecharcount;
1244 }
1245 state = trie->trans[ state + charid ].next;
1246 } else {
1247 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1248 }
1249 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1250 }
1251
1252 accept_state = TRIE_NODENUM( state );
1253 if ( !trie->states[ accept_state ].wordnum ) {
1254 /* we havent inserted this word into the structure yet. */
1255 trie->states[ accept_state ].wordnum = ++curword;
1256
1257 DEBUG_r({
1258 /* store the word for dumping */
1259 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1260 if ( UTF ) SvUTF8_on( tmp );
1261 av_push( trie->words, tmp );
1262 });
1263
1264 } else {
1265 /* Its a dupe. So ignore it. */
1266 }
1267
1268 } /* end second pass */
1269
1270 DEBUG_TRIE_COMPILE_MORE_r({
1271 /*
1272 print out the table precompression so that we can do a visual check
1273 that they are identical.
1274 */
1275 U32 state;
1276 U16 charid;
1277 PerlIO_printf( Perl_debug_log, "\nChar : " );
1278
1279 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1280 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1281 if ( tmp ) {
1282 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1283 }
1284 }
1285
1286 PerlIO_printf( Perl_debug_log, "\nState+-" );
1287
1288 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1289 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1290 }
1291
1292 PerlIO_printf( Perl_debug_log, "\n" );
1293
1294 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1295
1296 PerlIO_printf( Perl_debug_log, "%04X : ", TRIE_NODENUM( state ) );
1297
1298 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1299 PerlIO_printf( Perl_debug_log, "%04X ",
1300 SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1301 }
1302 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1303 PerlIO_printf( Perl_debug_log, " (%04X)\n", trie->trans[ state ].check );
1304 } else {
1305 PerlIO_printf( Perl_debug_log, " (%04X) W%04X\n", trie->trans[ state ].check,
1306 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1307 }
1308 }
1309 PerlIO_printf( Perl_debug_log, "\n\n" );
1310 });
1311 {
1312 /*
1313 * Inplace compress the table.*
1314
1315 For sparse data sets the table constructed by the trie algorithm will
1316 be mostly 0/FAIL transitions or to put it another way mostly empty.
1317 (Note that leaf nodes will not contain any transitions.)
1318
1319 This algorithm compresses the tables by eliminating most such
1320 transitions, at the cost of a modest bit of extra work during lookup:
1321
1322 - Each states[] entry contains a .base field which indicates the
1323 index in the state[] array wheres its transition data is stored.
1324
1325 - If .base is 0 there are no valid transitions from that node.
1326
1327 - If .base is nonzero then charid is added to it to find an entry in
1328 the trans array.
1329
1330 -If trans[states[state].base+charid].check!=state then the
1331 transition is taken to be a 0/Fail transition. Thus if there are fail
1332 transitions at the front of the node then the .base offset will point
1333 somewhere inside the previous nodes data (or maybe even into a node
1334 even earlier), but the .check field determines if the transition is
1335 valid.
1336
1337 The following process inplace converts the table to the compressed
1338 table: We first do not compress the root node 1,and mark its all its
1339 .check pointers as 1 and set its .base pointer as 1 as well. This
1340 allows to do a DFA construction from the compressed table later, and
1341 ensures that any .base pointers we calculate later are greater than
1342 0.
1343
1344 - We set 'pos' to indicate the first entry of the second node.
1345
1346 - We then iterate over the columns of the node, finding the first and
1347 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1348 and set the .check pointers accordingly, and advance pos
1349 appropriately and repreat for the next node. Note that when we copy
1350 the next pointers we have to convert them from the original
1351 NODEIDX form to NODENUM form as the former is not valid post
1352 compression.
1353
1354 - If a node has no transitions used we mark its base as 0 and do not
1355 advance the pos pointer.
1356
1357 - If a node only has one transition we use a second pointer into the
1358 structure to fill in allocated fail transitions from other states.
1359 This pointer is independent of the main pointer and scans forward
1360 looking for null transitions that are allocated to a state. When it
1361 finds one it writes the single transition into the "hole". If the
1362 pointer doesnt find one the single transition is appeneded as normal.
1363
1364 - Once compressed we can Renew/realloc the structures to release the
1365 excess space.
1366
1367 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1368 specifically Fig 3.47 and the associated pseudocode.
1369
1370 demq
1371 */
1372 U32 laststate = TRIE_NODENUM( next_alloc );
1373 U32 used , state, charid;
1374 U32 pos = 0, zp=0;
1375 trie->laststate = laststate;
1376
1377 for ( state = 1 ; state < laststate ; state++ ) {
1378 U8 flag = 0;
1379 U32 stateidx = TRIE_NODEIDX( state );
1380 U32 o_used=trie->trans[ stateidx ].check;
1381 used = trie->trans[ stateidx ].check;
1382 trie->trans[ stateidx ].check = 0;
1383
1384 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1385 if ( flag || trie->trans[ stateidx + charid ].next ) {
1386 if ( trie->trans[ stateidx + charid ].next ) {
1387 if (o_used == 1) {
1388 for ( ; zp < pos ; zp++ ) {
1389 if ( ! trie->trans[ zp ].next ) {
1390 break;
1391 }
1392 }
1393 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1394 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1395 trie->trans[ zp ].check = state;
1396 if ( ++zp > pos ) pos = zp;
1397 break;
1398 }
1399 used--;
1400 }
1401 if ( !flag ) {
1402 flag = 1;
1403 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1404 }
1405 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1406 trie->trans[ pos ].check = state;
1407 pos++;
1408 }
1409 }
1410 }
cc601c31 1411 trie->lasttrans = pos + 1;
a3621e74
YO
1412 Renew( trie->states, laststate + 1, reg_trie_state);
1413 DEBUG_TRIE_COMPILE_MORE_r(
1414 PerlIO_printf( Perl_debug_log, " Alloc: %d Orig: %d elements, Final:%d. Savings of %%%5.2f\n",
1415 ( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ), next_alloc, pos,
1416 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1417 );
1418
1419 } /* end table compress */
1420 }
cc601c31
YO
1421 /* resize the trans array to remove unused space */
1422 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1423
1424 DEBUG_TRIE_COMPILE_r({
1425 U32 state;
1426 /*
1427 Now we print it out again, in a slightly different form as there is additional
1428 info we want to be able to see when its compressed. They are close enough for
1429 visual comparison though.
1430 */
1431 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1432
1433 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1434 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1435 if ( tmp ) {
1436 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1437 }
1438 }
1439 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1440
a3621e74
YO
1441 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1442 PerlIO_printf( Perl_debug_log, "-----");
1443 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1444
a3621e74
YO
1445 for( state = 1 ; state < trie->laststate ; state++ ) {
1446 U32 base = trie->states[ state ].trans.base;
1447
1448 PerlIO_printf( Perl_debug_log, "#%04X ", state);
1449
1450 if ( trie->states[ state ].wordnum ) {
1451 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1452 } else {
1453 PerlIO_printf( Perl_debug_log, "%6s", "" );
1454 }
1455
1456 PerlIO_printf( Perl_debug_log, " @%04X ", base );
1457
1458 if ( base ) {
1459 U32 ofs = 0;
1460
cc601c31
YO
1461 while( ( base + ofs < trie->uniquecharcount ) ||
1462 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1463 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1464 ofs++;
1465
1466 PerlIO_printf( Perl_debug_log, "+%02X[ ", ofs);
1467
1468 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1469 if ( ( base + ofs >= trie->uniquecharcount ) &&
1470 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1471 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1472 {
1473 PerlIO_printf( Perl_debug_log, "%04X ",
1474 trie->trans[ base + ofs - trie->uniquecharcount ].next );
1475 } else {
1476 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1477 }
1478 }
1479
1480 PerlIO_printf( Perl_debug_log, "]", ofs);
1481
1482 }
1483 PerlIO_printf( Perl_debug_log, "\n" );
1484 }
1485 });
1486
1487 {
1488 /* now finally we "stitch in" the new TRIE node
1489 This means we convert either the first branch or the first Exact,
1490 depending on whether the thing following (in 'last') is a branch
1491 or not and whther first is the startbranch (ie is it a sub part of
1492 the alternation or is it the whole thing.)
1493 Assuming its a sub part we conver the EXACT otherwise we convert
1494 the whole branch sequence, including the first.
1495 */
1496 regnode *convert;
1497
1498
1499
1500
1501 if ( first == startbranch && OP( last ) != BRANCH ) {
1502 convert = first;
1503 } else {
1504 convert = NEXTOPER( first );
1505 NEXT_OFF( first ) = (U16)(last - first);
1506 }
1507
1508 OP( convert ) = TRIE + (U8)( flags - EXACT );
1509 NEXT_OFF( convert ) = (U16)(tail - convert);
1510 ARG_SET( convert, data_slot );
1511
1512 /* tells us if we need to handle accept buffers specially */
1513 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1514
1515
1516 /* needed for dumping*/
1517 DEBUG_r({
1518 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1519 /* We now need to mark all of the space originally used by the
1520 branches as optimized away. This keeps the dumpuntil from
1521 throwing a wobbly as it doesnt use regnext() to traverse the
1522 opcodes.
1523 */
1524 while( optimize < last ) {
1525 OP( optimize ) = OPTIMIZED;
1526 optimize++;
1527 }
1528 });
1529 } /* end node insert */
1530 return 1;
1531}
1532
1533
1534
1535/*
5d1c421c
JH
1536 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1537 * These need to be revisited when a newer toolchain becomes available.
1538 */
1539#if defined(__sparc64__) && defined(__GNUC__)
1540# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1541# undef SPARC64_GCC_WORKAROUND
1542# define SPARC64_GCC_WORKAROUND 1
1543# endif
1544#endif
1545
653099ff
GS
1546/* REx optimizer. Converts nodes into quickier variants "in place".
1547 Finds fixed substrings. */
1548
c277df42
IZ
1549/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1550 to the position after last scanned or to NULL. */
1551
a3621e74 1552
76e3520e 1553STATIC I32
a3621e74 1554S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1555 /* scanp: Start here (read-write). */
1556 /* deltap: Write maxlen-minlen here. */
1557 /* last: Stop before this one. */
1558{
1559 I32 min = 0, pars = 0, code;
1560 regnode *scan = *scanp, *next;
1561 I32 delta = 0;
1562 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1563 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1564 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1565 scan_data_t data_fake;
653099ff 1566 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1567 SV *re_trie_maxbuff = NULL;
1568
1569 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1570
c277df42
IZ
1571 while (scan && OP(scan) != END && scan < last) {
1572 /* Peephole optimizer: */
a3621e74
YO
1573 DEBUG_OPTIMISE_r({
1574 SV *mysv=sv_newmortal();
1575 regprop( mysv, scan);
1576 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08X)\n",depth*2,"",SvPV_nolen(mysv),scan);
1577 });
c277df42 1578
22c35a8c 1579 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1580 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1581 regnode *n = regnext(scan);
1582 U32 stringok = 1;
1583#ifdef DEBUGGING
1584 regnode *stop = scan;
b81d288d 1585#endif
c277df42 1586
cd439c50 1587 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1588 /* Skip NOTHING, merge EXACT*. */
1589 while (n &&
b81d288d 1590 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1591 (stringok && (OP(n) == OP(scan))))
1592 && NEXT_OFF(n)
1593 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1594 if (OP(n) == TAIL || n > next)
1595 stringok = 0;
22c35a8c 1596 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1597 NEXT_OFF(scan) += NEXT_OFF(n);
1598 next = n + NODE_STEP_REGNODE;
1599#ifdef DEBUGGING
1600 if (stringok)
1601 stop = n;
b81d288d 1602#endif
c277df42 1603 n = regnext(n);
a0ed51b3 1604 }
f49d4d0f 1605 else if (stringok) {
cd439c50 1606 int oldl = STR_LEN(scan);
c277df42 1607 regnode *nnext = regnext(n);
f49d4d0f 1608
b81d288d 1609 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1610 break;
1611 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1612 STR_LEN(scan) += STR_LEN(n);
1613 next = n + NODE_SZ_STR(n);
c277df42 1614 /* Now we can overwrite *n : */
f49d4d0f 1615 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1616#ifdef DEBUGGING
f49d4d0f 1617 stop = next - 1;
b81d288d 1618#endif
c277df42
IZ
1619 n = nnext;
1620 }
1621 }
61a36c01 1622
a3621e74 1623 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1624/*
1625 Two problematic code points in Unicode casefolding of EXACT nodes:
1626
1627 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1628 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1629
1630 which casefold to
1631
1632 Unicode UTF-8
1633
1634 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1635 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1636
1637 This means that in case-insensitive matching (or "loose matching",
1638 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1639 length of the above casefolded versions) can match a target string
1640 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1641 This would rather mess up the minimum length computation.
1642
1643 What we'll do is to look for the tail four bytes, and then peek
1644 at the preceding two bytes to see whether we need to decrease
1645 the minimum length by four (six minus two).
1646
1647 Thanks to the design of UTF-8, there cannot be false matches:
1648 A sequence of valid UTF-8 bytes cannot be a subsequence of
1649 another valid sequence of UTF-8 bytes.
1650
1651*/
1652 char *s0 = STRING(scan), *s, *t;
1653 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
bfed75c6
AL
1654 const char *t0 = "\xcc\x88\xcc\x81";
1655 const char *t1 = t0 + 3;
61a36c01
JH
1656
1657 for (s = s0 + 2;
1658 s < s2 && (t = ninstr(s, s1, t0, t1));
1659 s = t + 4) {
1660 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1661 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1662 min -= 4;
1663 }
1664 }
1665
c277df42
IZ
1666#ifdef DEBUGGING
1667 /* Allow dumping */
cd439c50 1668 n = scan + NODE_SZ_STR(scan);
c277df42 1669 while (n <= stop) {
22c35a8c 1670 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1671 OP(n) = OPTIMIZED;
1672 NEXT_OFF(n) = 0;
1673 }
1674 n++;
1675 }
653099ff 1676#endif
c277df42 1677 }
a3621e74
YO
1678
1679
1680
653099ff
GS
1681 /* Follow the next-chain of the current node and optimize
1682 away all the NOTHINGs from it. */
c277df42 1683 if (OP(scan) != CURLYX) {
048cfca1
GS
1684 int max = (reg_off_by_arg[OP(scan)]
1685 ? I32_MAX
1686 /* I32 may be smaller than U16 on CRAYs! */
1687 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1688 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1689 int noff;
1690 regnode *n = scan;
b81d288d 1691
c277df42
IZ
1692 /* Skip NOTHING and LONGJMP. */
1693 while ((n = regnext(n))
22c35a8c 1694 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1695 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1696 && off + noff < max)
1697 off += noff;
1698 if (reg_off_by_arg[OP(scan)])
1699 ARG(scan) = off;
b81d288d 1700 else
c277df42
IZ
1701 NEXT_OFF(scan) = off;
1702 }
a3621e74 1703
653099ff
GS
1704 /* The principal pseudo-switch. Cannot be a switch, since we
1705 look into several different things. */
b81d288d 1706 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1707 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1708 next = regnext(scan);
1709 code = OP(scan);
a3621e74 1710 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1711
1712 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1713 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1714 struct regnode_charclass_class accum;
a3621e74 1715 regnode *startbranch=scan;
c277df42 1716
653099ff 1717 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1718 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1719 if (flags & SCF_DO_STCLASS)
830247a4 1720 cl_init_zero(pRExC_state, &accum);
a3621e74 1721
c277df42 1722 while (OP(scan) == code) {
830247a4 1723 I32 deltanext, minnext, f = 0, fake;
653099ff 1724 struct regnode_charclass_class this_class;
c277df42
IZ
1725
1726 num++;
1727 data_fake.flags = 0;
b81d288d 1728 if (data) {
2c2d71f5 1729 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1730 data_fake.last_closep = data->last_closep;
1731 }
1732 else
1733 data_fake.last_closep = &fake;
c277df42
IZ
1734 next = regnext(scan);
1735 scan = NEXTOPER(scan);
1736 if (code != BRANCH)
1737 scan = NEXTOPER(scan);
653099ff 1738 if (flags & SCF_DO_STCLASS) {
830247a4 1739 cl_init(pRExC_state, &this_class);
653099ff
GS
1740 data_fake.start_class = &this_class;
1741 f = SCF_DO_STCLASS_AND;
b81d288d 1742 }
e1901655
IZ
1743 if (flags & SCF_WHILEM_VISITED_POS)
1744 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1745
653099ff 1746 /* we suppose the run is continuous, last=next...*/
830247a4 1747 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1748 next, &data_fake, f,depth+1);
b81d288d 1749 if (min1 > minnext)
c277df42
IZ
1750 min1 = minnext;
1751 if (max1 < minnext + deltanext)
1752 max1 = minnext + deltanext;
1753 if (deltanext == I32_MAX)
aca2d497 1754 is_inf = is_inf_internal = 1;
c277df42
IZ
1755 scan = next;
1756 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1757 pars++;
405ff068 1758 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1759 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1760 if (data)
1761 data->whilem_c = data_fake.whilem_c;
653099ff 1762 if (flags & SCF_DO_STCLASS)
830247a4 1763 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1764 if (code == SUSPEND)
c277df42
IZ
1765 break;
1766 }
1767 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1768 min1 = 0;
1769 if (flags & SCF_DO_SUBSTR) {
1770 data->pos_min += min1;
1771 data->pos_delta += max1 - min1;
1772 if (max1 != min1 || is_inf)
1773 data->longest = &(data->longest_float);
1774 }
1775 min += min1;
1776 delta += max1 - min1;
653099ff 1777 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1778 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1779 if (min1) {
1780 cl_and(data->start_class, &and_with);
1781 flags &= ~SCF_DO_STCLASS;
1782 }
1783 }
1784 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1785 if (min1) {
1786 cl_and(data->start_class, &accum);
653099ff 1787 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1788 }
1789 else {
b81d288d 1790 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1791 * data->start_class */
1792 StructCopy(data->start_class, &and_with,
1793 struct regnode_charclass_class);
1794 flags &= ~SCF_DO_STCLASS_AND;
1795 StructCopy(&accum, data->start_class,
1796 struct regnode_charclass_class);
1797 flags |= SCF_DO_STCLASS_OR;
1798 data->start_class->flags |= ANYOF_EOS;
1799 }
653099ff 1800 }
a3621e74
YO
1801
1802 /* demq.
1803
1804 Assuming this was/is a branch we are dealing with: 'scan' now
1805 points at the item that follows the branch sequence, whatever
1806 it is. We now start at the beginning of the sequence and look
1807 for subsequences of
1808
1809 BRANCH->EXACT=>X
1810 BRANCH->EXACT=>X
1811
1812 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1813
1814 If we can find such a subseqence we need to turn the first
1815 element into a trie and then add the subsequent branch exact
1816 strings to the trie.
1817
1818 We have two cases
1819
1820 1. patterns where the whole set of branch can be converted to a trie,
1821
1822 2. patterns where only a subset of the alternations can be
1823 converted to a trie.
1824
1825 In case 1 we can replace the whole set with a single regop
1826 for the trie. In case 2 we need to keep the start and end
1827 branchs so
1828
1829 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1830 becomes BRANCH TRIE; BRANCH X;
1831
1832 Hypthetically when we know the regex isnt anchored we can
1833 turn a case 1 into a DFA and let it rip... Every time it finds a match
1834 it would just call its tail, no WHILEM/CURLY needed.
1835
1836 */
1837 if (DO_TRIE) {
1838 if (!re_trie_maxbuff) {
1839 re_trie_maxbuff=get_sv(RE_TRIE_MAXBUFF, 1);
1840 if (!SvIOK(re_trie_maxbuff))
1841 sv_setiv(re_trie_maxbuff, TRIE_SIMPLE_MAX_BUFF);
1842
1843 }
1844 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1845 regnode *cur;
1846 regnode *first = (regnode *)NULL;
1847 regnode *last = (regnode *)NULL;
1848 regnode *tail = scan;
1849 U8 optype = 0;
1850 U32 count=0;
1851
1852#ifdef DEBUGGING
1853 SV *mysv = sv_newmortal(); /* for dumping */
1854#endif
1855 /* var tail is used because there may be a TAIL
1856 regop in the way. Ie, the exacts will point to the
1857 thing following the TAIL, but the last branch will
1858 point at the TAIL. So we advance tail. If we
1859 have nested (?:) we may have to move through several
1860 tails.
1861 */
1862
1863 while ( OP( tail ) == TAIL ) {
1864 /* this is the TAIL generated by (?:) */
1865 tail = regnext( tail );
1866 }
1867
1868 DEBUG_OPTIMISE_r({
1869 regprop( mysv, tail );
1870 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1871 depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
1872 (RExC_seen_evals) ? "[EVAL]" : ""
1873 );
1874 });
1875 /*
1876
1877 step through the branches, cur represents each
1878 branch, noper is the first thing to be matched
1879 as part of that branch and noper_next is the
1880 regnext() of that node. if noper is an EXACT
1881 and noper_next is the same as scan (our current
1882 position in the regex) then the EXACT branch is
1883 a possible optimization target. Once we have
1884 two or more consequetive such branches we can
1885 create a trie of the EXACT's contents and stich
1886 it in place. If the sequence represents all of
1887 the branches we eliminate the whole thing and
1888 replace it with a single TRIE. If it is a
1889 subsequence then we need to stitch it in. This
1890 means the first branch has to remain, and needs
1891 to be repointed at the item on the branch chain
1892 following the last branch optimized. This could
1893 be either a BRANCH, in which case the
1894 subsequence is internal, or it could be the
1895 item following the branch sequence in which
1896 case the subsequence is at the end.
1897
1898 */
1899
1900 /* dont use tail as the end marker for this traverse */
1901 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1902 regnode *noper = NEXTOPER( cur );
1903 regnode *noper_next = regnext( noper );
1904
1905
1906 DEBUG_OPTIMISE_r({
1907 regprop( mysv, cur);
1908 PerlIO_printf( Perl_debug_log, "%*s%s",
1909 depth * 2 + 2," ", SvPV_nolen( mysv ) );
1910
1911 regprop( mysv, noper);
1912 PerlIO_printf( Perl_debug_log, " -> %s",
1913 SvPV_nolen(mysv));
1914
1915 if ( noper_next ) {
1916 regprop( mysv, noper_next );
1917 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1918 SvPV_nolen(mysv));
1919 }
1920 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1921 first, last, cur );
1922 });
1923 if ( ( first ? OP( noper ) == optype
1924 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1925 && noper_next == tail && count<U16_MAX)
1926 {
1927 count++;
1928 if ( !first ) {
1929 first = cur;
1930 optype = OP( noper );
1931 } else {
1932 DEBUG_OPTIMISE_r(
1933 if (!last ) {
1934 regprop( mysv, first);
1935 PerlIO_printf( Perl_debug_log, "%*s%s",
1936 depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
1937 regprop( mysv, NEXTOPER(first) );
1938 PerlIO_printf( Perl_debug_log, " -> %s\n",
1939 SvPV_nolen( mysv ) );
1940 }
1941 );
1942 last = cur;
1943 DEBUG_OPTIMISE_r({
1944 regprop( mysv, cur);
1945 PerlIO_printf( Perl_debug_log, "%*s%s",
1946 depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
1947 regprop( mysv, noper );
1948 PerlIO_printf( Perl_debug_log, " -> %s\n",
1949 SvPV_nolen( mysv ) );
1950 });
1951 }
1952 } else {
1953 if ( last ) {
1954 DEBUG_OPTIMISE_r(
1955 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1956 depth * 2 + 2, "E:", "**END**" );
1957 );
1958 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1959 }
1960 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1961 && noper_next == tail )
1962 {
1963 count = 1;
1964 first = cur;
1965 optype = OP( noper );
1966 } else {
1967 count = 0;
1968 first = NULL;
1969 optype = 0;
1970 }
1971 last = NULL;
1972 }
1973 }
1974 DEBUG_OPTIMISE_r({
1975 regprop( mysv, cur);
1976 PerlIO_printf( Perl_debug_log,
1977 "%*s%s\t(0x%p,0x%p,0x%p)\n", depth * 2 + 2,
1978 " ", SvPV_nolen( mysv ), first, last, cur);
1979
1980 });
1981 if ( last ) {
1982 DEBUG_OPTIMISE_r(
1983 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1984 depth * 2 + 2, "E:", "==END==" );
1985 );
1986 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1987 }
1988 }
1989 }
a0ed51b3 1990 }
a3621e74 1991 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1992 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1993 } else /* single branch is optimized. */
c277df42
IZ
1994 scan = NEXTOPER(scan);
1995 continue;
a0ed51b3
LW
1996 }
1997 else if (OP(scan) == EXACT) {
cd439c50 1998 I32 l = STR_LEN(scan);
1aa99e6b 1999 UV uc = *((U8*)STRING(scan));
a0ed51b3 2000 if (UTF) {
1aa99e6b
IH
2001 U8 *s = (U8*)STRING(scan);
2002 l = utf8_length(s, s + l);
9041c2e3 2003 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2004 }
2005 min += l;
c277df42 2006 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2007 /* The code below prefers earlier match for fixed
2008 offset, later match for variable offset. */
2009 if (data->last_end == -1) { /* Update the start info. */
2010 data->last_start_min = data->pos_min;
2011 data->last_start_max = is_inf
b81d288d 2012 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2013 }
cd439c50 2014 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
2015 {
2016 SV * sv = data->last_found;
2017 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2018 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2019 if (mg && mg->mg_len >= 0)
5e43f467
JH
2020 mg->mg_len += utf8_length((U8*)STRING(scan),
2021 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2022 }
33b8afdf
JH
2023 if (UTF)
2024 SvUTF8_on(data->last_found);
c277df42
IZ
2025 data->last_end = data->pos_min + l;
2026 data->pos_min += l; /* As in the first entry. */
2027 data->flags &= ~SF_BEFORE_EOL;
2028 }
653099ff
GS
2029 if (flags & SCF_DO_STCLASS_AND) {
2030 /* Check whether it is compatible with what we know already! */
2031 int compat = 1;
2032
1aa99e6b 2033 if (uc >= 0x100 ||
516a5887 2034 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2035 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2036 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2037 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2038 )
653099ff
GS
2039 compat = 0;
2040 ANYOF_CLASS_ZERO(data->start_class);
2041 ANYOF_BITMAP_ZERO(data->start_class);
2042 if (compat)
1aa99e6b 2043 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2044 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2045 if (uc < 0x100)
2046 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2047 }
2048 else if (flags & SCF_DO_STCLASS_OR) {
2049 /* false positive possible if the class is case-folded */
1aa99e6b 2050 if (uc < 0x100)
9b877dbb
IH
2051 ANYOF_BITMAP_SET(data->start_class, uc);
2052 else
2053 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2054 data->start_class->flags &= ~ANYOF_EOS;
2055 cl_and(data->start_class, &and_with);
2056 }
2057 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2058 }
653099ff 2059 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2060 I32 l = STR_LEN(scan);
1aa99e6b 2061 UV uc = *((U8*)STRING(scan));
653099ff
GS
2062
2063 /* Search for fixed substrings supports EXACT only. */
b81d288d 2064 if (flags & SCF_DO_SUBSTR)
830247a4 2065 scan_commit(pRExC_state, data);
a0ed51b3 2066 if (UTF) {
1aa99e6b
IH
2067 U8 *s = (U8 *)STRING(scan);
2068 l = utf8_length(s, s + l);
9041c2e3 2069 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2070 }
2071 min += l;
c277df42 2072 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2073 data->pos_min += l;
653099ff
GS
2074 if (flags & SCF_DO_STCLASS_AND) {
2075 /* Check whether it is compatible with what we know already! */
2076 int compat = 1;
2077
1aa99e6b 2078 if (uc >= 0x100 ||
516a5887 2079 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2080 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2081 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2082 compat = 0;
2083 ANYOF_CLASS_ZERO(data->start_class);
2084 ANYOF_BITMAP_ZERO(data->start_class);
2085 if (compat) {
1aa99e6b 2086 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2087 data->start_class->flags &= ~ANYOF_EOS;
2088 data->start_class->flags |= ANYOF_FOLD;
2089 if (OP(scan) == EXACTFL)
2090 data->start_class->flags |= ANYOF_LOCALE;
2091 }
2092 }
2093 else if (flags & SCF_DO_STCLASS_OR) {
2094 if (data->start_class->flags & ANYOF_FOLD) {
2095 /* false positive possible if the class is case-folded.
2096 Assume that the locale settings are the same... */
1aa99e6b
IH
2097 if (uc < 0x100)
2098 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2099 data->start_class->flags &= ~ANYOF_EOS;
2100 }
2101 cl_and(data->start_class, &and_with);
2102 }
2103 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2104 }
bfed75c6 2105 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2106 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2107 I32 f = flags, pos_before = 0;
c277df42 2108 regnode *oscan = scan;
653099ff
GS
2109 struct regnode_charclass_class this_class;
2110 struct regnode_charclass_class *oclass = NULL;
727f22e3 2111 I32 next_is_eval = 0;
653099ff 2112
22c35a8c 2113 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2114 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2115 scan = NEXTOPER(scan);
2116 goto finish;
2117 case PLUS:
653099ff 2118 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2119 next = NEXTOPER(scan);
653099ff 2120 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2121 mincount = 1;
2122 maxcount = REG_INFTY;
c277df42
IZ
2123 next = regnext(scan);
2124 scan = NEXTOPER(scan);
2125 goto do_curly;
2126 }
2127 }
2128 if (flags & SCF_DO_SUBSTR)
2129 data->pos_min++;
2130 min++;
2131 /* Fall through. */
2132 case STAR:
653099ff
GS
2133 if (flags & SCF_DO_STCLASS) {
2134 mincount = 0;
b81d288d 2135 maxcount = REG_INFTY;
653099ff
GS
2136 next = regnext(scan);
2137 scan = NEXTOPER(scan);
2138 goto do_curly;
2139 }
b81d288d 2140 is_inf = is_inf_internal = 1;
c277df42
IZ
2141 scan = regnext(scan);
2142 if (flags & SCF_DO_SUBSTR) {
830247a4 2143 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2144 data->longest = &(data->longest_float);
2145 }
2146 goto optimize_curly_tail;
2147 case CURLY:
b81d288d 2148 mincount = ARG1(scan);
c277df42
IZ
2149 maxcount = ARG2(scan);
2150 next = regnext(scan);
cb434fcc
IZ
2151 if (OP(scan) == CURLYX) {
2152 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2153 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2154 }
c277df42 2155 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2156 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2157 do_curly:
2158 if (flags & SCF_DO_SUBSTR) {
830247a4 2159 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2160 pos_before = data->pos_min;
2161 }
2162 if (data) {
2163 fl = data->flags;
2164 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2165 if (is_inf)
2166 data->flags |= SF_IS_INF;
2167 }
653099ff 2168 if (flags & SCF_DO_STCLASS) {
830247a4 2169 cl_init(pRExC_state, &this_class);
653099ff
GS
2170 oclass = data->start_class;
2171 data->start_class = &this_class;
2172 f |= SCF_DO_STCLASS_AND;
2173 f &= ~SCF_DO_STCLASS_OR;
2174 }
e1901655
IZ
2175 /* These are the cases when once a subexpression
2176 fails at a particular position, it cannot succeed
2177 even after backtracking at the enclosing scope.
b81d288d 2178
e1901655
IZ
2179 XXXX what if minimal match and we are at the
2180 initial run of {n,m}? */
2181 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2182 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2183
c277df42 2184 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2185 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2186 (mincount == 0
2187 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2188
2189 if (flags & SCF_DO_STCLASS)
2190 data->start_class = oclass;
2191 if (mincount == 0 || minnext == 0) {
2192 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2193 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2194 }
2195 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2196 /* Switch to OR mode: cache the old value of
653099ff
GS
2197 * data->start_class */
2198 StructCopy(data->start_class, &and_with,
2199 struct regnode_charclass_class);
2200 flags &= ~SCF_DO_STCLASS_AND;
2201 StructCopy(&this_class, data->start_class,
2202 struct regnode_charclass_class);
2203 flags |= SCF_DO_STCLASS_OR;
2204 data->start_class->flags |= ANYOF_EOS;
2205 }
2206 } else { /* Non-zero len */
2207 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2208 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2209 cl_and(data->start_class, &and_with);
2210 }
2211 else if (flags & SCF_DO_STCLASS_AND)
2212 cl_and(data->start_class, &this_class);
2213 flags &= ~SCF_DO_STCLASS;
2214 }
c277df42
IZ
2215 if (!scan) /* It was not CURLYX, but CURLY. */
2216 scan = next;
84037bb0 2217 if (ckWARN(WARN_REGEXP)
727f22e3
JP
2218 /* ? quantifier ok, except for (?{ ... }) */
2219 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2220 && (minnext == 0) && (deltanext == 0)
99799961 2221 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 2222 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 2223 {
830247a4 2224 vWARN(RExC_parse,
b45f050a
JF
2225 "Quantifier unexpected on zero-length expression");
2226 }
2227
c277df42 2228 min += minnext * mincount;
b81d288d 2229 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2230 && (minnext + deltanext) > 0)
2231 || deltanext == I32_MAX);
aca2d497 2232 is_inf |= is_inf_internal;
c277df42
IZ
2233 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2234
2235 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2236 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2237 && data->flags & SF_IN_PAR
2238 && !(data->flags & SF_HAS_EVAL)
2239 && !deltanext && minnext == 1 ) {
2240 /* Try to optimize to CURLYN. */
2241 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2242 regnode *nxt1 = nxt;
2243#ifdef DEBUGGING
2244 regnode *nxt2;
2245#endif
c277df42
IZ
2246
2247 /* Skip open. */
2248 nxt = regnext(nxt);
bfed75c6 2249 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2250 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2251 && STR_LEN(nxt) == 1))
c277df42 2252 goto nogo;
497b47a8 2253#ifdef DEBUGGING
c277df42 2254 nxt2 = nxt;
497b47a8 2255#endif
c277df42 2256 nxt = regnext(nxt);
b81d288d 2257 if (OP(nxt) != CLOSE)
c277df42
IZ
2258 goto nogo;
2259 /* Now we know that nxt2 is the only contents: */
eb160463 2260 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2261 OP(oscan) = CURLYN;
2262 OP(nxt1) = NOTHING; /* was OPEN. */
2263#ifdef DEBUGGING
2264 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2265 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2266 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2267 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2268 OP(nxt + 1) = OPTIMIZED; /* was count. */
2269 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2270#endif
c277df42 2271 }
c277df42
IZ
2272 nogo:
2273
2274 /* Try optimization CURLYX => CURLYM. */
b81d288d 2275 if ( OP(oscan) == CURLYX && data
c277df42 2276 && !(data->flags & SF_HAS_PAR)
c277df42 2277 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2278 && !deltanext /* atom is fixed width */
2279 && minnext != 0 /* CURLYM can't handle zero width */
2280 ) {
c277df42
IZ
2281 /* XXXX How to optimize if data == 0? */
2282 /* Optimize to a simpler form. */
2283 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2284 regnode *nxt2;
2285
2286 OP(oscan) = CURLYM;
2287 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2288 && (OP(nxt2) != WHILEM))
c277df42
IZ
2289 nxt = nxt2;
2290 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2291 /* Need to optimize away parenths. */
2292 if (data->flags & SF_IN_PAR) {
2293 /* Set the parenth number. */
2294 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2295
b81d288d 2296 if (OP(nxt) != CLOSE)
b45f050a 2297 FAIL("Panic opt close");
eb160463 2298 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2299 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2300 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2301#ifdef DEBUGGING
2302 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2303 OP(nxt + 1) = OPTIMIZED; /* was count. */
2304 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2305 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2306#endif
c277df42
IZ
2307#if 0
2308 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2309 regnode *nnxt = regnext(nxt1);
b81d288d 2310
c277df42
IZ
2311 if (nnxt == nxt) {
2312 if (reg_off_by_arg[OP(nxt1)])
2313 ARG_SET(nxt1, nxt2 - nxt1);
2314 else if (nxt2 - nxt1 < U16_MAX)
2315 NEXT_OFF(nxt1) = nxt2 - nxt1;
2316 else
2317 OP(nxt) = NOTHING; /* Cannot beautify */
2318 }
2319 nxt1 = nnxt;
2320 }
2321#endif
2322 /* Optimize again: */
b81d288d 2323 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2324 NULL, 0,depth+1);
a0ed51b3
LW
2325 }
2326 else
c277df42 2327 oscan->flags = 0;
c277df42 2328 }
e1901655
IZ
2329 else if ((OP(oscan) == CURLYX)
2330 && (flags & SCF_WHILEM_VISITED_POS)
2331 /* See the comment on a similar expression above.
2332 However, this time it not a subexpression
2333 we care about, but the expression itself. */
2334 && (maxcount == REG_INFTY)
2335 && data && ++data->whilem_c < 16) {
2336 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2337 /* Find WHILEM (as in regexec.c) */
2338 regnode *nxt = oscan + NEXT_OFF(oscan);
2339
2340 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2341 nxt += ARG(nxt);
eb160463
GS
2342 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2343 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2344 }
b81d288d 2345 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2346 pars++;
2347 if (flags & SCF_DO_SUBSTR) {
2348 SV *last_str = Nullsv;
2349 int counted = mincount != 0;
2350
2351 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2352#if defined(SPARC64_GCC_WORKAROUND)
2353 I32 b = 0;
2354 STRLEN l = 0;
2355 char *s = NULL;
2356 I32 old = 0;
2357
2358 if (pos_before >= data->last_start_min)
2359 b = pos_before;
2360 else
2361 b = data->last_start_min;
2362
2363 l = 0;
2364 s = SvPV(data->last_found, l);
2365 old = b - data->last_start_min;
2366
2367#else
b81d288d 2368 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2369 ? pos_before : data->last_start_min;
2370 STRLEN l;
2371 char *s = SvPV(data->last_found, l);
a0ed51b3 2372 I32 old = b - data->last_start_min;
5d1c421c 2373#endif
a0ed51b3
LW
2374
2375 if (UTF)
2376 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2377
a0ed51b3 2378 l -= old;
c277df42 2379 /* Get the added string: */
79cb57f6 2380 last_str = newSVpvn(s + old, l);
0e933229
IH
2381 if (UTF)
2382 SvUTF8_on(last_str);
c277df42
IZ
2383 if (deltanext == 0 && pos_before == b) {
2384 /* What was added is a constant string */
2385 if (mincount > 1) {
2386 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2387 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
2388 SvPVX(last_str), l, mincount - 1);
2389 SvCUR(last_str) *= mincount;
2390 /* Add additional parts. */
b81d288d 2391 SvCUR_set(data->last_found,
c277df42
IZ
2392 SvCUR(data->last_found) - l);
2393 sv_catsv(data->last_found, last_str);
0eda9292
JH
2394 {
2395 SV * sv = data->last_found;
2396 MAGIC *mg =
2397 SvUTF8(sv) && SvMAGICAL(sv) ?
2398 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2399 if (mg && mg->mg_len >= 0)
2400 mg->mg_len += CHR_SVLEN(last_str);
2401 }
c277df42
IZ
2402 data->last_end += l * (mincount - 1);
2403 }
2a8d9689
HS
2404 } else {
2405 /* start offset must point into the last copy */
2406 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2407 data->last_start_max += is_inf ? I32_MAX
2408 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2409 }
2410 }
2411 /* It is counted once already... */
2412 data->pos_min += minnext * (mincount - counted);
2413 data->pos_delta += - counted * deltanext +
2414 (minnext + deltanext) * maxcount - minnext * mincount;
2415 if (mincount != maxcount) {
653099ff
GS
2416 /* Cannot extend fixed substrings found inside
2417 the group. */
830247a4 2418 scan_commit(pRExC_state,data);
c277df42
IZ
2419 if (mincount && last_str) {
2420 sv_setsv(data->last_found, last_str);
2421 data->last_end = data->pos_min;
b81d288d 2422 data->last_start_min =
a0ed51b3 2423 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2424 data->last_start_max = is_inf
2425 ? I32_MAX
c277df42 2426 : data->pos_min + data->pos_delta
a0ed51b3 2427 - CHR_SVLEN(last_str);
c277df42
IZ
2428 }
2429 data->longest = &(data->longest_float);
2430 }
aca2d497 2431 SvREFCNT_dec(last_str);
c277df42 2432 }
405ff068 2433 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2434 data->flags |= SF_HAS_EVAL;
2435 optimize_curly_tail:
c277df42 2436 if (OP(oscan) != CURLYX) {
22c35a8c 2437 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2438 && NEXT_OFF(next))
2439 NEXT_OFF(oscan) += NEXT_OFF(next);
2440 }
c277df42 2441 continue;
653099ff 2442 default: /* REF and CLUMP only? */
c277df42 2443 if (flags & SCF_DO_SUBSTR) {
830247a4 2444 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2445 data->longest = &(data->longest_float);
2446 }
aca2d497 2447 is_inf = is_inf_internal = 1;
653099ff 2448 if (flags & SCF_DO_STCLASS_OR)
830247a4 2449 cl_anything(pRExC_state, data->start_class);
653099ff 2450 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2451 break;
2452 }
a0ed51b3 2453 }
bfed75c6 2454 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2455 int value = 0;
653099ff 2456
c277df42 2457 if (flags & SCF_DO_SUBSTR) {
830247a4 2458 scan_commit(pRExC_state,data);
c277df42
IZ
2459 data->pos_min++;
2460 }
2461 min++;
653099ff
GS
2462 if (flags & SCF_DO_STCLASS) {
2463 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2464
2465 /* Some of the logic below assumes that switching
2466 locale on will only add false positives. */
2467 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2468 case SANY:
653099ff
GS
2469 default:
2470 do_default:
2471 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2472 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2473 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2474 break;
2475 case REG_ANY:
2476 if (OP(scan) == SANY)
2477 goto do_default;
2478 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2479 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2480 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2481 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2482 }
2483 if (flags & SCF_DO_STCLASS_AND || !value)
2484 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2485 break;
2486 case ANYOF:
2487 if (flags & SCF_DO_STCLASS_AND)
2488 cl_and(data->start_class,
2489 (struct regnode_charclass_class*)scan);
2490 else
830247a4 2491 cl_or(pRExC_state, data->start_class,
653099ff
GS
2492 (struct regnode_charclass_class*)scan);
2493 break;
2494 case ALNUM:
2495 if (flags & SCF_DO_STCLASS_AND) {
2496 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2498 for (value = 0; value < 256; value++)
2499 if (!isALNUM(value))
2500 ANYOF_BITMAP_CLEAR(data->start_class, value);
2501 }
2502 }
2503 else {
2504 if (data->start_class->flags & ANYOF_LOCALE)
2505 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2506 else {
2507 for (value = 0; value < 256; value++)
2508 if (isALNUM(value))
b81d288d 2509 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2510 }
2511 }
2512 break;
2513 case ALNUML:
2514 if (flags & SCF_DO_STCLASS_AND) {
2515 if (data->start_class->flags & ANYOF_LOCALE)
2516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2517 }
2518 else {
2519 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2520 data->start_class->flags |= ANYOF_LOCALE;
2521 }
2522 break;
2523 case NALNUM:
2524 if (flags & SCF_DO_STCLASS_AND) {
2525 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2526 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2527 for (value = 0; value < 256; value++)
2528 if (isALNUM(value))
2529 ANYOF_BITMAP_CLEAR(data->start_class, value);
2530 }
2531 }
2532 else {
2533 if (data->start_class->flags & ANYOF_LOCALE)
2534 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2535 else {
2536 for (value = 0; value < 256; value++)
2537 if (!isALNUM(value))
b81d288d 2538 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2539 }
2540 }
2541 break;
2542 case NALNUML:
2543 if (flags & SCF_DO_STCLASS_AND) {
2544 if (data->start_class->flags & ANYOF_LOCALE)
2545 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2546 }
2547 else {
2548 data->start_class->flags |= ANYOF_LOCALE;
2549 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2550 }
2551 break;
2552 case SPACE:
2553 if (flags & SCF_DO_STCLASS_AND) {
2554 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2555 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2556 for (value = 0; value < 256; value++)
2557 if (!isSPACE(value))
2558 ANYOF_BITMAP_CLEAR(data->start_class, value);
2559 }
2560 }
2561 else {
2562 if (data->start_class->flags & ANYOF_LOCALE)
2563 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2564 else {
2565 for (value = 0; value < 256; value++)
2566 if (isSPACE(value))
b81d288d 2567 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2568 }
2569 }
2570 break;
2571 case SPACEL:
2572 if (flags & SCF_DO_STCLASS_AND) {
2573 if (data->start_class->flags & ANYOF_LOCALE)
2574 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2575 }
2576 else {
2577 data->start_class->flags |= ANYOF_LOCALE;
2578 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2579 }
2580 break;
2581 case NSPACE:
2582 if (flags & SCF_DO_STCLASS_AND) {
2583 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2584 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2585 for (value = 0; value < 256; value++)
2586 if (isSPACE(value))
2587 ANYOF_BITMAP_CLEAR(data->start_class, value);
2588 }
2589 }
2590 else {
2591 if (data->start_class->flags & ANYOF_LOCALE)
2592 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2593 else {
2594 for (value = 0; value < 256; value++)
2595 if (!isSPACE(value))
b81d288d 2596 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2597 }
2598 }
2599 break;
2600 case NSPACEL:
2601 if (flags & SCF_DO_STCLASS_AND) {
2602 if (data->start_class->flags & ANYOF_LOCALE) {
2603 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2604 for (value = 0; value < 256; value++)
2605 if (!isSPACE(value))
2606 ANYOF_BITMAP_CLEAR(data->start_class, value);
2607 }
2608 }
2609 else {
2610 data->start_class->flags |= ANYOF_LOCALE;
2611 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2612 }
2613 break;
2614 case DIGIT:
2615 if (flags & SCF_DO_STCLASS_AND) {
2616 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2617 for (value = 0; value < 256; value++)
2618 if (!isDIGIT(value))
2619 ANYOF_BITMAP_CLEAR(data->start_class, value);
2620 }
2621 else {
2622 if (data->start_class->flags & ANYOF_LOCALE)
2623 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2624 else {
2625 for (value = 0; value < 256; value++)
2626 if (isDIGIT(value))
b81d288d 2627 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2628 }
2629 }
2630 break;
2631 case NDIGIT:
2632 if (flags & SCF_DO_STCLASS_AND) {
2633 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2634 for (value = 0; value < 256; value++)
2635 if (isDIGIT(value))
2636 ANYOF_BITMAP_CLEAR(data->start_class, value);
2637 }
2638 else {
2639 if (data->start_class->flags & ANYOF_LOCALE)
2640 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2641 else {
2642 for (value = 0; value < 256; value++)
2643 if (!isDIGIT(value))
b81d288d 2644 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2645 }
2646 }
2647 break;
2648 }
2649 if (flags & SCF_DO_STCLASS_OR)
2650 cl_and(data->start_class, &and_with);
2651 flags &= ~SCF_DO_STCLASS;
2652 }
a0ed51b3 2653 }
22c35a8c 2654 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2655 data->flags |= (OP(scan) == MEOL
2656 ? SF_BEFORE_MEOL
2657 : SF_BEFORE_SEOL);
a0ed51b3 2658 }
653099ff
GS
2659 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2660 /* Lookbehind, or need to calculate parens/evals/stclass: */
2661 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2662 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2663 /* Lookahead/lookbehind */
cb434fcc 2664 I32 deltanext, minnext, fake = 0;
c277df42 2665 regnode *nscan;
653099ff
GS
2666 struct regnode_charclass_class intrnl;
2667 int f = 0;
c277df42
IZ
2668
2669 data_fake.flags = 0;
b81d288d 2670 if (data) {
2c2d71f5 2671 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2672 data_fake.last_closep = data->last_closep;
2673 }
2674 else
2675 data_fake.last_closep = &fake;
653099ff
GS
2676 if ( flags & SCF_DO_STCLASS && !scan->flags
2677 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2678 cl_init(pRExC_state, &intrnl);
653099ff 2679 data_fake.start_class = &intrnl;
e1901655 2680 f |= SCF_DO_STCLASS_AND;
653099ff 2681 }
e1901655
IZ
2682 if (flags & SCF_WHILEM_VISITED_POS)
2683 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2684 next = regnext(scan);
2685 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2686 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2687 if (scan->flags) {
2688 if (deltanext) {
9baa0206 2689 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2690 }
2691 else if (minnext > U8_MAX) {
9baa0206 2692 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2693 }
eb160463 2694 scan->flags = (U8)minnext;
c277df42
IZ
2695 }
2696 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2697 pars++;
405ff068 2698 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2699 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2700 if (data)
2701 data->whilem_c = data_fake.whilem_c;
e1901655 2702 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
2703 int was = (data->start_class->flags & ANYOF_EOS);
2704
2705 cl_and(data->start_class, &intrnl);
2706 if (was)
2707 data->start_class->flags |= ANYOF_EOS;
2708 }
a0ed51b3
LW
2709 }
2710 else if (OP(scan) == OPEN) {
c277df42 2711 pars++;
a0ed51b3 2712 }
cb434fcc 2713 else if (OP(scan) == CLOSE) {
eb160463 2714 if ((I32)ARG(scan) == is_par) {
cb434fcc 2715 next = regnext(scan);
c277df42 2716
cb434fcc
IZ
2717 if ( next && (OP(next) != WHILEM) && next < last)
2718 is_par = 0; /* Disable optimization */
2719 }
2720 if (data)
2721 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2722 }
2723 else if (OP(scan) == EVAL) {
c277df42
IZ
2724 if (data)
2725 data->flags |= SF_HAS_EVAL;
2726 }
96776eda 2727 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2728 if (flags & SCF_DO_SUBSTR) {
830247a4 2729 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2730 data->longest = &(data->longest_float);
2731 }
2732 is_inf = is_inf_internal = 1;
653099ff 2733 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2734 cl_anything(pRExC_state, data->start_class);
96776eda 2735 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2736 }
c277df42
IZ
2737 /* Else: zero-length, ignore. */
2738 scan = regnext(scan);
2739 }
2740
2741 finish:
2742 *scanp = scan;
aca2d497 2743 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2744 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2745 data->pos_delta = I32_MAX - data->pos_min;
2746 if (is_par > U8_MAX)
2747 is_par = 0;
2748 if (is_par && pars==1 && data) {
2749 data->flags |= SF_IN_PAR;
2750 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2751 }
2752 else if (pars && data) {
c277df42
IZ
2753 data->flags |= SF_HAS_PAR;
2754 data->flags &= ~SF_IN_PAR;
2755 }
653099ff
GS
2756 if (flags & SCF_DO_STCLASS_OR)
2757 cl_and(data->start_class, &and_with);
c277df42
IZ
2758 return min;
2759}
2760
76e3520e 2761STATIC I32
bfed75c6 2762S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2763{
830247a4 2764 if (RExC_rx->data) {
b81d288d
AB
2765 Renewc(RExC_rx->data,
2766 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2767 char, struct reg_data);
830247a4
IZ
2768 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2769 RExC_rx->data->count += n;
a0ed51b3
LW
2770 }
2771 else {
830247a4 2772 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2773 char, struct reg_data);
830247a4
IZ
2774 New(1208, RExC_rx->data->what, n, U8);
2775 RExC_rx->data->count = n;
c277df42 2776 }
830247a4
IZ
2777 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2778 return RExC_rx->data->count - n;
c277df42
IZ
2779}
2780
d88dccdf 2781void
864dbfa3 2782Perl_reginitcolors(pTHX)
d88dccdf 2783{
d88dccdf
IZ
2784 int i = 0;
2785 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 2786
d88dccdf
IZ
2787 if (s) {
2788 PL_colors[0] = s = savepv(s);
2789 while (++i < 6) {
2790 s = strchr(s, '\t');
2791 if (s) {
2792 *s = '\0';
2793 PL_colors[i] = ++s;
2794 }
2795 else
c712d376 2796 PL_colors[i] = s = "";
d88dccdf
IZ
2797 }
2798 } else {
b81d288d 2799 while (i < 6)
d88dccdf
IZ
2800 PL_colors[i++] = "";
2801 }
2802 PL_colorset = 1;
2803}
2804
8615cb43 2805
a687059c 2806/*
e50aee73 2807 - pregcomp - compile a regular expression into internal code
a687059c
LW
2808 *
2809 * We can't allocate space until we know how big the compiled form will be,
2810 * but we can't compile it (and thus know how big it is) until we've got a
2811 * place to put the code. So we cheat: we compile it twice, once with code
2812 * generation turned off and size counting turned on, and once "for real".
2813 * This also means that we don't allocate space until we are sure that the
2814 * thing really will compile successfully, and we never have to move the
2815 * code and thus invalidate pointers into it. (Note that it has to be in
2816 * one piece because free() must be able to free it all.) [NB: not true in perl]
2817 *
2818 * Beware that the optimization-preparation code in here knows about some
2819 * of the structure of the compiled regexp. [I'll say.]
2820 */
2821regexp *
864dbfa3 2822Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2823{
a0d0e21e 2824 register regexp *r;
c277df42 2825 regnode *scan;
c277df42 2826 regnode *first;
a0d0e21e 2827 I32 flags;
a0d0e21e
LW
2828 I32 minlen = 0;
2829 I32 sawplus = 0;
2830 I32 sawopen = 0;
2c2d71f5 2831 scan_data_t data;
830247a4
IZ
2832 RExC_state_t RExC_state;
2833 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2834
a3621e74
YO
2835 GET_RE_DEBUG_FLAGS_DECL;
2836
a0d0e21e 2837 if (exp == NULL)
c277df42 2838 FAIL("NULL regexp argument");
a0d0e21e 2839
a5961de5 2840 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2841
5cfc7842 2842 RExC_precomp = exp;
a3621e74
YO
2843 DEBUG_r(if (!PL_colorset) reginitcolors());
2844 DEBUG_COMPILE_r({
2845 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2846 PL_colors[4],PL_colors[5],PL_colors[0],
2847 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2848 });
e2509266 2849 RExC_flags = pm->op_pmflags;
830247a4 2850 RExC_sawback = 0;
bbce6d69 2851
830247a4
IZ
2852 RExC_seen = 0;
2853 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2854 RExC_seen_evals = 0;
2855 RExC_extralen = 0;
c277df42 2856
bbce6d69 2857 /* First pass: determine size, legality. */
830247a4 2858 RExC_parse = exp;
fac92740 2859 RExC_start = exp;
830247a4
IZ
2860 RExC_end = xend;
2861 RExC_naughty = 0;
2862 RExC_npar = 1;
2863 RExC_size = 0L;
2864 RExC_emit = &PL_regdummy;
2865 RExC_whilem_seen = 0;
85ddcde9
JH
2866#if 0 /* REGC() is (currently) a NOP at the first pass.
2867 * Clever compilers notice this and complain. --jhi */
830247a4 2868 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2869#endif
830247a4 2870 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 2871 RExC_precomp = Nullch;
a0d0e21e
LW
2872 return(NULL);
2873 }
a3621e74 2874 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2875
c277df42
IZ
2876 /* Small enough for pointer-storage convention?
2877 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2878 if (RExC_size >= 0x10000L && RExC_extralen)
2879 RExC_size += RExC_extralen;
c277df42 2880 else
830247a4
IZ
2881 RExC_extralen = 0;
2882 if (RExC_whilem_seen > 15)
2883 RExC_whilem_seen = 15;
a0d0e21e 2884
bbce6d69 2885 /* Allocate space and initialize. */
830247a4 2886 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2887 char, regexp);
a0d0e21e 2888 if (r == NULL)
b45f050a
JF
2889 FAIL("Regexp out of space");
2890
0f79a09d
GS
2891#ifdef DEBUGGING
2892 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2893 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2894#endif
c277df42 2895 r->refcnt = 1;
bbce6d69 2896 r->prelen = xend - exp;
5cfc7842 2897 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2898 r->subbeg = NULL;
ed252734
NC
2899#ifdef PERL_COPY_ON_WRITE
2900 r->saved_copy = Nullsv;
2901#endif
cf93c79d 2902 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2903 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2904
2905 r->substrs = 0; /* Useful during FAIL. */
2906 r->startp = 0; /* Useful during FAIL. */
2907 r->endp = 0; /* Useful during FAIL. */
2908
fac92740
MJD
2909 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2910 if (r->offsets) {
2911 r->offsets[0] = RExC_size;
2912 }
a3621e74 2913 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
392fbf5d 2914 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 2915 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2916 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2917
830247a4 2918 RExC_rx = r;
bbce6d69 2919
2920 /* Second pass: emit code. */
e2509266 2921 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2922 RExC_parse = exp;
2923 RExC_end = xend;
2924 RExC_naughty = 0;
2925 RExC_npar = 1;
fac92740 2926 RExC_emit_start = r->program;
830247a4 2927 RExC_emit = r->program;
2cd61cdb 2928 /* Store the count of eval-groups for security checks: */
eb160463 2929 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2930 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2931 r->data = 0;
830247a4 2932 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2933 return(NULL);
2934
a3621e74 2935
a0d0e21e 2936 /* Dig out information for optimizations. */
cf93c79d 2937 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2938 pm->op_pmflags = RExC_flags;
a0ed51b3 2939 if (UTF)
5ff6fc6d 2940 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2941 r->regstclass = NULL;
830247a4 2942 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2943 r->reganch |= ROPT_NAUGHTY;
c277df42 2944 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2945
2946 /* XXXX To minimize changes to RE engine we always allocate
2947 3-units-long substrs field. */
2948 Newz(1004, r->substrs, 1, struct reg_substr_data);
2949
2c2d71f5 2950 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2951 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2952 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2953 I32 fake;
c5254dd6 2954 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2955 struct regnode_charclass_class ch_class;
2956 int stclass_flag;
cb434fcc 2957 I32 last_close = 0;
a0d0e21e
LW
2958
2959 first = scan;
c277df42 2960 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2961 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2962 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2963 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2964 (OP(first) == PLUS) ||
2965 (OP(first) == MINMOD) ||
653099ff 2966 /* An {n,m} with n>0 */
22c35a8c 2967 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2968 if (OP(first) == PLUS)
2969 sawplus = 1;
2970 else
2971 first += regarglen[(U8)OP(first)];
2972 first = NEXTOPER(first);
a687059c
LW
2973 }
2974
a0d0e21e
LW
2975 /* Starting-point info. */
2976 again:
653099ff 2977 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2978 if (OP(first) == EXACT)
2979 ; /* Empty, get anchored substr later. */
2980 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2981 r->regstclass = first;
2982 }
bfed75c6 2983 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2984 r->regstclass = first;
22c35a8c
GS
2985 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2986 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2987 r->regstclass = first;
22c35a8c 2988 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2989 r->reganch |= (OP(first) == MBOL
2990 ? ROPT_ANCH_MBOL
2991 : (OP(first) == SBOL
2992 ? ROPT_ANCH_SBOL
2993 : ROPT_ANCH_BOL));
a0d0e21e 2994 first = NEXTOPER(first);
774d564b 2995 goto again;
2996 }
2997 else if (OP(first) == GPOS) {
2998 r->reganch |= ROPT_ANCH_GPOS;
2999 first = NEXTOPER(first);
3000 goto again;
a0d0e21e 3001 }
e09294f4 3002 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 3003 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3004 !(r->reganch & ROPT_ANCH) )
3005 {
3006 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
3007 int type = OP(NEXTOPER(first));
3008
ffc61ed2 3009 if (type == REG_ANY)
cad2e5aa
JH
3010 type = ROPT_ANCH_MBOL;
3011 else
3012 type = ROPT_ANCH_SBOL;
3013
3014 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3015 first = NEXTOPER(first);
774d564b 3016 goto again;
a0d0e21e 3017 }
b81d288d 3018 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3019 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3020 /* x+ must match at the 1st pos of run of x's */
3021 r->reganch |= ROPT_SKIP;
a0d0e21e 3022
c277df42 3023 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3024 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3025 (IV)(first - scan + 1)));
a0d0e21e
LW
3026 /*
3027 * If there's something expensive in the r.e., find the
3028 * longest literal string that must appear and make it the
3029 * regmust. Resolve ties in favor of later strings, since
3030 * the regstart check works with the beginning of the r.e.
3031 * and avoiding duplication strengthens checking. Not a
3032 * strong reason, but sufficient in the absence of others.
3033 * [Now we resolve ties in favor of the earlier string if
c277df42 3034 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3035 * earlier string may buy us something the later one won't.]
3036 */
a0d0e21e 3037 minlen = 0;
a687059c 3038
79cb57f6
GS
3039 data.longest_fixed = newSVpvn("",0);
3040 data.longest_float = newSVpvn("",0);
3041 data.last_found = newSVpvn("",0);
c277df42
IZ
3042 data.longest = &(data.longest_fixed);
3043 first = scan;
653099ff 3044 if (!r->regstclass) {
830247a4 3045 cl_init(pRExC_state, &ch_class);
653099ff
GS
3046 data.start_class = &ch_class;
3047 stclass_flag = SCF_DO_STCLASS_AND;
3048 } else /* XXXX Check for BOUND? */
3049 stclass_flag = 0;
cb434fcc 3050 data.last_closep = &last_close;
653099ff 3051
830247a4 3052 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3053 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3054 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3055 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3056 && !RExC_seen_zerolen
3057 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3058 r->reganch |= ROPT_CHECK_ALL;
830247a4 3059 scan_commit(pRExC_state, &data);
c277df42
IZ
3060 SvREFCNT_dec(data.last_found);
3061
a0ed51b3 3062 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3063 if (longest_float_length
c277df42
IZ
3064 || (data.flags & SF_FL_BEFORE_EOL
3065 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3066 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3067 int t;
3068
a0ed51b3 3069 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3070 && data.offset_fixed == data.offset_float_min
3071 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3072 goto remove_float; /* As in (a)+. */
3073
33b8afdf
JH
3074 if (SvUTF8(data.longest_float)) {
3075 r->float_utf8 = data.longest_float;
3076 r->float_substr = Nullsv;
3077 } else {
3078 r->float_substr = data.longest_float;
3079 r->float_utf8 = Nullsv;
3080 }
c277df42
IZ
3081 r->float_min_offset = data.offset_float_min;
3082 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3083 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3084 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3085 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3086 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3087 }
3088 else {
aca2d497 3089 remove_float:
33b8afdf 3090 r->float_substr = r->float_utf8 = Nullsv;
c277df42 3091 SvREFCNT_dec(data.longest_float);
c5254dd6 3092 longest_float_length = 0;
a0d0e21e 3093 }
c277df42 3094
a0ed51b3 3095 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3096 if (longest_fixed_length
c277df42
IZ
3097 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3098 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3099 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3100 int t;
3101
33b8afdf
JH
3102 if (SvUTF8(data.longest_fixed)) {
3103 r->anchored_utf8 = data.longest_fixed;
3104 r->anchored_substr = Nullsv;
3105 } else {
3106 r->anchored_substr = data.longest_fixed;
3107 r->anchored_utf8 = Nullsv;
3108 }
c277df42 3109 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3110 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3111 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3112 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3113 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3114 }
3115 else {
33b8afdf 3116 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 3117 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3118 longest_fixed_length = 0;
a0d0e21e 3119 }
b81d288d 3120 if (r->regstclass
ffc61ed2 3121 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3122 r->regstclass = NULL;
33b8afdf
JH
3123 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3124 && stclass_flag
653099ff 3125 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3126 && !cl_is_anything(data.start_class))
3127 {
830247a4 3128 I32 n = add_data(pRExC_state, 1, "f");
653099ff 3129
b81d288d 3130 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3131 struct regnode_charclass_class);
3132 StructCopy(data.start_class,
830247a4 3133 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3134 struct regnode_charclass_class);
830247a4 3135 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3136 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3137 PL_regdata = r->data; /* for regprop() */
a3621e74 3138 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3139 regprop(sv, (regnode*)data.start_class);
3140 PerlIO_printf(Perl_debug_log,
3141 "synthetic stclass `%s'.\n",
3142 SvPVX(sv));});
653099ff 3143 }
c277df42
IZ
3144
3145 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3146 if (longest_fixed_length > longest_float_length) {
c277df42 3147 r->check_substr = r->anchored_substr;
33b8afdf 3148 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3149 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3150 if (r->reganch & ROPT_ANCH_SINGLE)
3151 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3152 }
3153 else {
c277df42 3154 r->check_substr = r->float_substr;
33b8afdf 3155 r->check_utf8 = r->float_utf8;
c277df42
IZ
3156 r->check_offset_min = data.offset_float_min;
3157 r->check_offset_max = data.offset_float_max;
a0d0e21e 3158 }
30382c73
IZ
3159 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3160 This should be changed ASAP! */
33b8afdf 3161 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3162 r->reganch |= RE_USE_INTUIT;
33b8afdf 3163 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3164 r->reganch |= RE_INTUIT_TAIL;
3165 }
a0ed51b3
LW
3166 }
3167 else {
c277df42
IZ
3168 /* Several toplevels. Best we can is to set minlen. */
3169 I32 fake;
653099ff 3170 struct regnode_charclass_class ch_class;
cb434fcc 3171 I32 last_close = 0;
c277df42 3172
a3621e74 3173 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3174 scan = r->program + 1;
830247a4 3175 cl_init(pRExC_state, &ch_class);
653099ff 3176 data.start_class = &ch_class;
cb434fcc 3177 data.last_closep = &last_close;
a3621e74 3178 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf
JH
3179 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3180 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 3181 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3182 && !cl_is_anything(data.start_class))
3183 {
830247a4 3184 I32 n = add_data(pRExC_state, 1, "f");
653099ff 3185
b81d288d 3186 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3187 struct regnode_charclass_class);
3188 StructCopy(data.start_class,
830247a4 3189 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3190 struct regnode_charclass_class);
830247a4 3191 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3192 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3193 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3194 regprop(sv, (regnode*)data.start_class);
3195 PerlIO_printf(Perl_debug_log,
3196 "synthetic stclass `%s'.\n",
3197 SvPVX(sv));});
653099ff 3198 }
a0d0e21e
LW
3199 }
3200
a0d0e21e 3201 r->minlen = minlen;
b81d288d 3202 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3203 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3204 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3205 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3206 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3207 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3208 if (RExC_seen & REG_SEEN_CANY)
3209 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
3210 Newz(1002, r->startp, RExC_npar, I32);
3211 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 3212 PL_regdata = r->data; /* for regprop() */
a3621e74 3213 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3214 return(r);
a687059c
LW
3215}
3216
3217/*
3218 - reg - regular expression, i.e. main body or parenthesized thing
3219 *
3220 * Caller must absorb opening parenthesis.
3221 *
3222 * Combining parenthesis handling with the base level of regular expression
3223 * is a trifle forced, but the need to tie the tails of the branches to what
3224 * follows makes it hard to avoid.
3225 */
76e3520e 3226STATIC regnode *
830247a4 3227S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3228 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3229{
c277df42
IZ
3230 register regnode *ret; /* Will be the head of the group. */
3231 register regnode *br;
3232 register regnode *lastbr;
3233 register regnode *ender = 0;
a0d0e21e 3234 register I32 parno = 0;
e2509266 3235 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
3236
3237 /* for (?g), (?gc), and (?o) warnings; warning
3238 about (?c) will warn about (?g) -- japhy */
3239
3240 I32 wastedflags = 0x00,
3241 wasted_o = 0x01,
3242 wasted_g = 0x02,
3243 wasted_gc = 0x02 | 0x04,
3244 wasted_c = 0x04;
3245
fac92740 3246 char * parse_start = RExC_parse; /* MJD */
830247a4 3247 char *oregcomp_parse = RExC_parse;
c277df42 3248 char c;
a0d0e21e 3249
821b33a5 3250 *flagp = 0; /* Tentatively. */
a0d0e21e 3251
9d1d55b5 3252
a0d0e21e
LW
3253 /* Make an OPEN node, if parenthesized. */
3254 if (paren) {
fac92740 3255 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3256 U32 posflags = 0, negflags = 0;
3257 U32 *flagsp = &posflags;
0f5d15d6 3258 int logical = 0;
830247a4 3259 char *seqstart = RExC_parse;
ca9dfc88 3260
830247a4
IZ
3261 RExC_parse++;
3262 paren = *RExC_parse++;
c277df42 3263 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3264 switch (paren) {
fac92740 3265 case '<': /* (?<...) */
830247a4 3266 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3267 if (*RExC_parse == '!')
c277df42 3268 paren = ',';
b81d288d 3269 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3270 goto unknown;
830247a4 3271 RExC_parse++;
fac92740
MJD
3272 case '=': /* (?=...) */
3273 case '!': /* (?!...) */
830247a4 3274 RExC_seen_zerolen++;
fac92740
MJD
3275 case ':': /* (?:...) */
3276 case '>': /* (?>...) */
a0d0e21e 3277 break;
fac92740
MJD
3278 case '$': /* (?$...) */
3279 case '@': /* (?@...) */
8615cb43 3280 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3281 break;
fac92740 3282 case '#': /* (?#...) */
830247a4
IZ
3283 while (*RExC_parse && *RExC_parse != ')')
3284 RExC_parse++;
3285 if (*RExC_parse != ')')
c277df42 3286 FAIL("Sequence (?#... not terminated");
830247a4 3287 nextchar(pRExC_state);
a0d0e21e
LW
3288 *flagp = TRYAGAIN;
3289 return NULL;
fac92740 3290 case 'p': /* (?p...) */
9014280d 3291 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3292 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3293 /* FALL THROUGH*/
fac92740 3294 case '?': /* (??...) */
0f5d15d6 3295 logical = 1;
438a3801
YST
3296 if (*RExC_parse != '{')
3297 goto unknown;
830247a4 3298 paren = *RExC_parse++;
0f5d15d6 3299 /* FALL THROUGH */
fac92740 3300 case '{': /* (?{...}) */
c277df42 3301 {
c277df42
IZ
3302 I32 count = 1, n = 0;
3303 char c;
830247a4 3304 char *s = RExC_parse;
c277df42
IZ
3305 SV *sv;
3306 OP_4tree *sop, *rop;
3307
830247a4
IZ
3308 RExC_seen_zerolen++;
3309 RExC_seen |= REG_SEEN_EVAL;
3310 while (count && (c = *RExC_parse)) {
3311 if (c == '\\' && RExC_parse[1])
3312 RExC_parse++;
b81d288d 3313 else if (c == '{')
c277df42 3314 count++;
b81d288d 3315 else if (c == '}')
c277df42 3316 count--;
830247a4 3317 RExC_parse++;
c277df42 3318 }
830247a4 3319 if (*RExC_parse != ')')
b45f050a 3320 {
b81d288d 3321 RExC_parse = s;
b45f050a
JF
3322 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3323 }
c277df42 3324 if (!SIZE_ONLY) {
f3548bdc 3325 PAD *pad;
b81d288d
AB
3326
3327 if (RExC_parse - 1 - s)
830247a4 3328 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3329 else
79cb57f6 3330 sv = newSVpvn("", 0);
c277df42 3331
569233ed
SB
3332 ENTER;
3333 Perl_save_re_context(aTHX);
f3548bdc 3334 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3335 sop->op_private |= OPpREFCOUNTED;
3336 /* re_dup will OpREFCNT_inc */
3337 OpREFCNT_set(sop, 1);
569233ed 3338 LEAVE;
c277df42 3339
830247a4
IZ
3340 n = add_data(pRExC_state, 3, "nop");
3341 RExC_rx->data->data[n] = (void*)rop;
3342 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3343 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3344 SvREFCNT_dec(sv);
a0ed51b3 3345 }
e24b16f9 3346 else { /* First pass */
830247a4 3347 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3348 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3349 /* No compiled RE interpolated, has runtime
3350 components ===> unsafe. */
3351 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3352 if (PL_tainting && PL_tainted)
cc6b7395 3353 FAIL("Eval-group in insecure regular expression");
923e4eb5 3354 if (IN_PERL_COMPILETIME)
b5c19bd7 3355 PL_cv_has_eval = 1;
c277df42 3356 }
b5c19bd7 3357
830247a4 3358 nextchar(pRExC_state);
0f5d15d6 3359 if (logical) {
830247a4 3360 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3361 if (!SIZE_ONLY)
3362 ret->flags = 2;
830247a4 3363 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3364 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3365 return ret;
3366 }
ccb2c380
MP
3367 ret = reganode(pRExC_state, EVAL, n);
3368 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3369 Set_Node_Offset(ret, parse_start);
3370 return ret;
c277df42 3371 }
fac92740 3372 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3373 {
fac92740 3374 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3375 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3376 || RExC_parse[1] == '<'
830247a4 3377 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3378 I32 flag;
3379
830247a4 3380 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3381 if (!SIZE_ONLY)
3382 ret->flags = 1;
830247a4 3383 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3384 goto insert_if;
b81d288d 3385 }
a0ed51b3 3386 }
830247a4 3387 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3388 /* (?(1)...) */
830247a4 3389 parno = atoi(RExC_parse++);
c277df42 3390
830247a4
IZ
3391 while (isDIGIT(*RExC_parse))
3392 RExC_parse++;
fac92740
MJD
3393 ret = reganode(pRExC_state, GROUPP, parno);
3394
830247a4 3395 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3396 vFAIL("Switch condition not recognized");
c277df42 3397 insert_if:
830247a4
IZ
3398 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3399 br = regbranch(pRExC_state, &flags, 1);
c277df42 3400 if (br == NULL)
830247a4 3401 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3402 else
830247a4
IZ
3403 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3404 c = *nextchar(pRExC_state);
d1b80229
IZ
3405 if (flags&HASWIDTH)
3406 *flagp |= HASWIDTH;
c277df42 3407 if (c == '|') {
830247a4
IZ
3408 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3409 regbranch(pRExC_state, &flags, 1);
3410 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3411 if (flags&HASWIDTH)
3412 *flagp |= HASWIDTH;
830247a4 3413 c = *nextchar(pRExC_state);
a0ed51b3
LW
3414 }
3415 else
c277df42
IZ
3416 lastbr = NULL;
3417 if (c != ')')
8615cb43 3418 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3419 ender = reg_node(pRExC_state, TAIL);
3420 regtail(pRExC_state, br, ender);
c277df42 3421 if (lastbr) {
830247a4
IZ
3422 regtail(pRExC_state, lastbr, ender);
3423 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3424 }
3425 else
830247a4 3426 regtail(pRExC_state, ret, ender);
c277df42 3427 return ret;
a0ed51b3
LW
3428 }
3429 else {
830247a4 3430 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3431 }
3432 }
1b1626e4 3433 case 0:
830247a4 3434 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3435 vFAIL("Sequence (? incomplete");
1b1626e4 3436 break;
a0d0e21e 3437 default:
830247a4 3438 --RExC_parse;
fac92740 3439 parse_flags: /* (?i) */
830247a4 3440 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3441 /* (?g), (?gc) and (?o) are useless here
3442 and must be globally applied -- japhy */
3443
3444 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3445 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3446 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3447 if (! (wastedflags & wflagbit) ) {
3448 wastedflags |= wflagbit;
3449 vWARN5(
3450 RExC_parse + 1,
3451 "Useless (%s%c) - %suse /%c modifier",
3452 flagsp == &negflags ? "?-" : "?",
3453 *RExC_parse,
3454 flagsp == &negflags ? "don't " : "",
3455 *RExC_parse
3456 );
3457 }
3458 }
3459 }
3460 else if (*RExC_parse == 'c') {
3461 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3462 if (! (wastedflags & wasted_c) ) {
3463 wastedflags |= wasted_gc;
3464 vWARN3(
3465 RExC_parse + 1,
3466 "Useless (%sc) - %suse /gc modifier",
3467 flagsp == &negflags ? "?-" : "?",
3468 flagsp == &negflags ? "don't " : ""
3469 );
3470 }
3471 }
3472 }
3473 else { pmflag(flagsp, *RExC_parse); }
3474
830247a4 3475 ++RExC_parse;
ca9dfc88 3476 }
830247a4 3477 if (*RExC_parse == '-') {
ca9dfc88 3478 flagsp = &negflags;
9d1d55b5 3479 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3480 ++RExC_parse;
ca9dfc88 3481 goto parse_flags;
48c036b1 3482 }
e2509266
JH
3483 RExC_flags |= posflags;
3484 RExC_flags &= ~negflags;
830247a4
IZ
3485 if (*RExC_parse == ':') {
3486 RExC_parse++;
ca9dfc88
IZ
3487 paren = ':';
3488 break;
3489 }
c277df42 3490 unknown:
830247a4
IZ
3491 if (*RExC_parse != ')') {
3492 RExC_parse++;
3493 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3494 }
830247a4 3495 nextchar(pRExC_state);
a0d0e21e
LW
3496 *flagp = TRYAGAIN;
3497 return NULL;
3498 }
3499 }
fac92740 3500 else { /* (...) */
830247a4
IZ
3501 parno = RExC_npar;
3502 RExC_npar++;
3503 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3504 Set_Node_Length(ret, 1); /* MJD */
3505 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3506 open = 1;
a0d0e21e 3507 }
a0ed51b3 3508 }
fac92740 3509 else /* ! paren */
a0d0e21e
LW
3510 ret = NULL;
3511
3512 /* Pick up the branches, linking them together. */
fac92740 3513 parse_start = RExC_parse; /* MJD */
830247a4 3514 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
3515 /* branch_len = (paren != 0); */
3516
a0d0e21e
LW
3517 if (br == NULL)
3518 return(NULL);
830247a4
IZ
3519 if (*RExC_parse == '|') {
3520 if (!SIZE_ONLY && RExC_extralen) {
3521 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3522 }
fac92740 3523 else { /* MJD */
830247a4 3524 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3525 Set_Node_Length(br, paren != 0);
3526 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3527 }
c277df42
IZ
3528 have_branch = 1;
3529 if (SIZE_ONLY)
830247a4 3530 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3531 }
3532 else if (paren == ':') {
c277df42
IZ
3533 *flagp |= flags&SIMPLE;
3534 }
3535 if (open) { /* Starts with OPEN. */
830247a4 3536 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3537 }
3538 else if (paren != '?') /* Not Conditional */
a0d0e21e 3539 ret = br;
32a0ca98 3540 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3541 lastbr = br;
830247a4
IZ
3542 while (*RExC_parse == '|') {
3543 if (!SIZE_ONLY && RExC_extralen) {
3544 ender = reganode(pRExC_state, LONGJMP,0);
3545 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3546 }
3547 if (SIZE_ONLY)
830247a4
IZ
3548 RExC_extralen += 2; /* Account for LONGJMP. */
3549 nextchar(pRExC_state);
3550 br = regbranch(pRExC_state, &flags, 0);
fac92740 3551
a687059c 3552 if (br == NULL)
a0d0e21e 3553 return(NULL);
830247a4 3554 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3555 lastbr = br;
821b33a5
IZ
3556 if (flags&HASWIDTH)
3557 *flagp |= HASWIDTH;
a687059c 3558 *flagp |= flags&SPSTART;
a0d0e21e
LW
3559 }
3560
c277df42
IZ
3561 if (have_branch || paren != ':') {
3562 /* Make a closing node, and hook it on the end. */
3563 switch (paren) {
3564 case ':':
830247a4 3565 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3566 break;
3567 case 1:
830247a4 3568 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3569 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3570 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3571 break;
3572 case '<':
c277df42
IZ
3573 case ',':
3574 case '=':
3575 case '!':
c277df42 3576 *flagp &= ~HASWIDTH;
821b33a5
IZ
3577 /* FALL THROUGH */
3578 case '>':
830247a4 3579 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3580 break;
3581 case 0:
830247a4 3582 ender = reg_node(pRExC_state, END);
c277df42
IZ
3583 break;
3584 }
830247a4 3585 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3586
c277df42
IZ
3587 if (have_branch) {
3588 /* Hook the tails of the branches to the closing node. */
3589 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3590 regoptail(pRExC_state, br, ender);
c277df42
IZ
3591 }
3592 }
a0d0e21e 3593 }
c277df42
IZ
3594
3595 {
bfed75c6
AL
3596 const char *p;
3597 static const char parens[] = "=!<,>";
c277df42
IZ
3598
3599 if (paren && (p = strchr(parens, paren))) {
eb160463 3600 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3601 int flag = (p - parens) > 1;
3602
3603 if (paren == '>')
3604 node = SUSPEND, flag = 0;
830247a4 3605 reginsert(pRExC_state, node,ret);
45948336
EP
3606 Set_Node_Cur_Length(ret);
3607 Set_Node_Offset(ret, parse_start + 1);
c277df42 3608 ret->flags = flag;
830247a4 3609 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3610 }
a0d0e21e
LW
3611 }
3612
3613 /* Check for proper termination. */
ce3e6498 3614 if (paren) {
e2509266 3615 RExC_flags = oregflags;
830247a4
IZ
3616 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3617 RExC_parse = oregcomp_parse;
380a0633 3618 vFAIL("Unmatched (");
ce3e6498 3619 }
a0ed51b3 3620 }
830247a4
IZ
3621 else if (!paren && RExC_parse < RExC_end) {
3622 if (*RExC_parse == ')') {
3623 RExC_parse++;
380a0633 3624 vFAIL("Unmatched )");
a0ed51b3
LW
3625 }
3626 else
b45f050a 3627 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3628 /* NOTREACHED */
3629 }
a687059c 3630
a0d0e21e 3631 return(ret);
a687059c
LW
3632}
3633
3634/*
3635 - regbranch - one alternative of an | operator
3636 *
3637 * Implements the concatenation operator.
3638 */
76e3520e 3639STATIC regnode *
830247a4 3640S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3641{
c277df42
IZ
3642 register regnode *ret;
3643 register regnode *chain = NULL;
3644 register regnode *latest;
3645 I32 flags = 0, c = 0;
a0d0e21e 3646
b81d288d 3647 if (first)
c277df42
IZ
3648 ret = NULL;
3649 else {
b81d288d 3650 if (!SIZE_ONLY && RExC_extralen)
830247a4 3651 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3652 else {
830247a4 3653 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3654 Set_Node_Length(ret, 1);
3655 }
c277df42
IZ
3656 }
3657
b81d288d 3658 if (!first && SIZE_ONLY)
830247a4 3659 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3660
c277df42 3661 *flagp = WORST; /* Tentatively. */
a0d0e21e 3662
830247a4
IZ
3663 RExC_parse--;
3664 nextchar(pRExC_state);
3665 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3666 flags &= ~TRYAGAIN;
830247a4 3667 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3668 if (latest == NULL) {
3669 if (flags & TRYAGAIN)
3670 continue;
3671 return(NULL);
a0ed51b3
LW
3672 }
3673 else if (ret == NULL)
c277df42 3674 ret = latest;
a0d0e21e 3675 *flagp |= flags&HASWIDTH;
c277df42 3676 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3677 *flagp |= flags&SPSTART;
3678 else {
830247a4
IZ
3679 RExC_naughty++;
3680 regtail(pRExC_state, chain, latest);
a687059c 3681 }
a0d0e21e 3682 chain = latest;
c277df42
IZ
3683 c++;
3684 }
3685 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3686 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3687 if (ret == NULL)
3688 ret = chain;
3689 }
3690 if (c == 1) {
3691 *flagp |= flags&SIMPLE;
a0d0e21e 3692 }
a687059c 3693
a0d0e21e 3694 return(ret);
a687059c
LW
3695}
3696
3697/*
3698 - regpiece - something followed by possible [*+?]
3699 *
3700 * Note that the branching code sequences used for ? and the general cases
3701 * of * and + are somewhat optimized: they use the same NOTHING node as
3702 * both the endmarker for their branch list and the body of the last branch.
3703 * It might seem that this node could be dispensed with entirely, but the
3704 * endmarker role is not redundant.
3705 */
76e3520e 3706STATIC regnode *
830247a4 3707S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3708{
c277df42 3709 register regnode *ret;
a0d0e21e
LW
3710 register char op;
3711 register char *next;
3712 I32 flags;
830247a4 3713 char *origparse = RExC_parse;
a0d0e21e
LW
3714 char *maxpos;
3715 I32 min;
c277df42 3716 I32 max = REG_INFTY;
fac92740 3717