This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Why hide functions (and duplicate them) when you can just avoid
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
c5be433b
GS
51
52# define PERL_NO_GET_CONTEXT
b81d288d 53#endif
56953603 54
a687059c 55/*
e50aee73 56 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
57 *
58 * Copyright (c) 1986 by University of Toronto.
59 * Written by Henry Spencer. Not derived from licensed software.
60 *
61 * Permission is granted to anyone to use this software for any
62 * purpose on any computer system, and to redistribute it freely,
63 * subject to the following restrictions:
64 *
65 * 1. The author is not responsible for the consequences of use of
66 * this software, no matter how awful, even if they arise
67 * from defects in it.
68 *
69 * 2. The origin of this software must not be misrepresented, either
70 * by explicit claim or by omission.
71 *
72 * 3. Altered versions must be plainly marked as such, and must not
73 * be misrepresented as being the original software.
74 *
75 *
76 **** Alterations to Henry's code are...
77 ****
4bb101f2 78 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 79 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 80 ****
9ef589d8
LW
81 **** You may distribute under the terms of either the GNU General Public
82 **** License or the Artistic License, as specified in the README file.
83
a687059c
LW
84 *
85 * Beware that some of this code is subtly aware of the way operator
86 * precedence is structured in regular expressions. Serious changes in
87 * regular-expression syntax might require a total rethink.
88 */
89#include "EXTERN.h"
864dbfa3 90#define PERL_IN_REGCOMP_C
a687059c 91#include "perl.h"
d06ea78c 92
acfe0abc 93#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
94# include "INTERN.h"
95#endif
c277df42
IZ
96
97#define REG_COMP_C
a687059c
LW
98#include "regcomp.h"
99
d4cce5f1 100#ifdef op
11343788 101#undef op
d4cce5f1 102#endif /* op */
11343788 103
fe14fcc3 104#ifdef MSDOS
7e4e8c89 105# if defined(BUGGY_MSC6)
fe14fcc3 106 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 107# pragma optimize("a",off)
fe14fcc3 108 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
109# pragma optimize("w",on )
110# endif /* BUGGY_MSC6 */
fe14fcc3
LW
111#endif /* MSDOS */
112
a687059c
LW
113#ifndef STATIC
114#define STATIC static
115#endif
116
830247a4 117typedef struct RExC_state_t {
e2509266 118 U32 flags; /* are we folding, multilining? */
830247a4
IZ
119 char *precomp; /* uncompiled string. */
120 regexp *rx;
fac92740 121 char *start; /* Start of input for compile */
830247a4
IZ
122 char *end; /* End of input for compile */
123 char *parse; /* Input-scan pointer. */
124 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 125 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 126 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
129 U32 seen;
130 I32 size; /* Code size. */
131 I32 npar; /* () count. */
132 I32 extralen;
133 I32 seen_zerolen;
134 I32 seen_evals;
1aa99e6b 135 I32 utf8;
830247a4
IZ
136#if ADD_TO_REGEXEC
137 char *starttry; /* -Dr: where regtry was called. */
138#define RExC_starttry (pRExC_state->starttry)
139#endif
140} RExC_state_t;
141
e2509266 142#define RExC_flags (pRExC_state->flags)
830247a4
IZ
143#define RExC_precomp (pRExC_state->precomp)
144#define RExC_rx (pRExC_state->rx)
fac92740 145#define RExC_start (pRExC_state->start)
830247a4
IZ
146#define RExC_end (pRExC_state->end)
147#define RExC_parse (pRExC_state->parse)
148#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 149#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 150#define RExC_emit (pRExC_state->emit)
fac92740 151#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
152#define RExC_naughty (pRExC_state->naughty)
153#define RExC_sawback (pRExC_state->sawback)
154#define RExC_seen (pRExC_state->seen)
155#define RExC_size (pRExC_state->size)
156#define RExC_npar (pRExC_state->npar)
157#define RExC_extralen (pRExC_state->extralen)
158#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
159#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 160#define RExC_utf8 (pRExC_state->utf8)
830247a4 161
a687059c
LW
162#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
163#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
164 ((*s) == '{' && regcurly(s)))
a687059c 165
35c8bce7
LW
166#ifdef SPSTART
167#undef SPSTART /* dratted cpp namespace... */
168#endif
a687059c
LW
169/*
170 * Flags to be passed up and down.
171 */
a687059c 172#define WORST 0 /* Worst case. */
821b33a5 173#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
174#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
175#define SPSTART 0x4 /* Starts with * or +. */
176#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 177
2c2d71f5
JH
178/* Length of a variant. */
179
180typedef struct scan_data_t {
181 I32 len_min;
182 I32 len_delta;
183 I32 pos_min;
184 I32 pos_delta;
185 SV *last_found;
186 I32 last_end; /* min value, <0 unless valid. */
187 I32 last_start_min;
188 I32 last_start_max;
189 SV **longest; /* Either &l_fixed, or &l_float. */
190 SV *longest_fixed;
191 I32 offset_fixed;
192 SV *longest_float;
193 I32 offset_float_min;
194 I32 offset_float_max;
195 I32 flags;
196 I32 whilem_c;
cb434fcc 197 I32 *last_closep;
653099ff 198 struct regnode_charclass_class *start_class;
2c2d71f5
JH
199} scan_data_t;
200
a687059c 201/*
e50aee73 202 * Forward declarations for pregcomp()'s friends.
a687059c 203 */
a0d0e21e 204
27da23d5
JH
205static const scan_data_t zero_scan_data =
206 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
207
208#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
209#define SF_BEFORE_SEOL 0x1
210#define SF_BEFORE_MEOL 0x2
211#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
212#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213
09b7f37c
CB
214#ifdef NO_UNARY_PLUS
215# define SF_FIX_SHIFT_EOL (0+2)
216# define SF_FL_SHIFT_EOL (0+4)
217#else
218# define SF_FIX_SHIFT_EOL (+2)
219# define SF_FL_SHIFT_EOL (+4)
220#endif
c277df42
IZ
221
222#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
223#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
224
225#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
226#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
227#define SF_IS_INF 0x40
228#define SF_HAS_PAR 0x80
229#define SF_IN_PAR 0x100
230#define SF_HAS_EVAL 0x200
4bfe0158 231#define SCF_DO_SUBSTR 0x400
653099ff
GS
232#define SCF_DO_STCLASS_AND 0x0800
233#define SCF_DO_STCLASS_OR 0x1000
234#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 235#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 236
eb160463 237#define UTF (RExC_utf8 != 0)
e2509266
JH
238#define LOC ((RExC_flags & PMf_LOCALE) != 0)
239#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 240
ffc61ed2 241#define OOB_UNICODE 12345678
93733859 242#define OOB_NAMEDCLASS -1
b8c5462f 243
a0ed51b3
LW
244#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
246
8615cb43 247
b45f050a
JF
248/* length of regex to show in messages that don't mark a position within */
249#define RegexLengthToShowInErrorMessages 127
250
251/*
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
255 */
7253e4e3
RK
256#define MARKER1 "<-- HERE" /* marker as it appears in the description */
257#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 258
7253e4e3 259#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
260
261/*
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
264 * "...".
265 */
ccb2c380 266#define FAIL(msg) STMT_START { \
bfed75c6 267 const char *ellipses = ""; \
ccb2c380
MP
268 IV len = RExC_end - RExC_precomp; \
269 \
270 if (!SIZE_ONLY) \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
275 ellipses = "..."; \
276 } \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
279} STMT_END
8615cb43 280
b45f050a 281/*
b45f050a
JF
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
283 */
ccb2c380 284#define Simple_vFAIL(m) STMT_START { \
a28509cc 285 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
288} STMT_END
b45f050a
JF
289
290/*
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
292 */
ccb2c380
MP
293#define vFAIL(m) STMT_START { \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 Simple_vFAIL(m); \
297} STMT_END
b45f050a
JF
298
299/*
300 * Like Simple_vFAIL(), but accepts two arguments.
301 */
ccb2c380 302#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 303 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
306} STMT_END
b45f050a
JF
307
308/*
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
310 */
ccb2c380
MP
311#define vFAIL2(m,a1) STMT_START { \
312 if (!SIZE_ONLY) \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
315} STMT_END
b45f050a
JF
316
317
318/*
319 * Like Simple_vFAIL(), but accepts three arguments.
320 */
ccb2c380 321#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 322 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
325} STMT_END
b45f050a
JF
326
327/*
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
329 */
ccb2c380
MP
330#define vFAIL3(m,a1,a2) STMT_START { \
331 if (!SIZE_ONLY) \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
334} STMT_END
b45f050a
JF
335
336/*
337 * Like Simple_vFAIL(), but accepts four arguments.
338 */
ccb2c380 339#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 340 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
343} STMT_END
b45f050a 344
ccb2c380 345#define vWARN(loc,m) STMT_START { \
a28509cc 346 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351#define vWARNdep(loc,m) STMT_START { \
a28509cc 352 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358
359#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 360 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
363} STMT_END
364
365#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 366 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
370
371#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 372 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
375} STMT_END
376
377#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 378 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
9d1d55b5 382
8615cb43 383
cd439c50 384/* Allow for side effects in s */
ccb2c380
MP
385#define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
387} STMT_END
cd439c50 388
fac92740
MJD
389/* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 */
394
395#define MJD_OFFSET_DEBUG(x)
a3621e74 396/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
397
398
399#define Set_Node_Offset_To_R(node,byte) STMT_START { \
400 if (! SIZE_ONLY) { \
401 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
402 __LINE__, (node), (byte))); \
403 if((node) < 0) { \
551405c4 404 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
405 } else { \
406 RExC_offsets[2*(node)-1] = (byte); \
407 } \
408 } \
409} STMT_END
410
411#define Set_Node_Offset(node,byte) \
412 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
413#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
414
415#define Set_Node_Length_To_R(node,len) STMT_START { \
416 if (! SIZE_ONLY) { \
417 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 418 __LINE__, (int)(node), (int)(len))); \
ccb2c380 419 if((node) < 0) { \
551405c4 420 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
421 } else { \
422 RExC_offsets[2*(node)] = (len); \
423 } \
424 } \
425} STMT_END
426
427#define Set_Node_Length(node,len) \
428 Set_Node_Length_To_R((node)-RExC_emit_start, len)
429#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
430#define Set_Node_Cur_Length(node) \
431 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
432
433/* Get offsets and lengths */
434#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
435#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
436
acfe0abc 437static void clear_re(pTHX_ void *r);
4327152a 438
653099ff
GS
439/* Mark that we cannot extend a found fixed substring at this point.
440 Updata the longest found anchored substring and the longest found
441 floating substrings if needed. */
442
4327152a 443STATIC void
097eb12c 444S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 445{
e1ec3a88
AL
446 const STRLEN l = CHR_SVLEN(data->last_found);
447 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 448
c277df42 449 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 450 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
451 if (*data->longest == data->longest_fixed) {
452 data->offset_fixed = l ? data->last_start_min : data->pos_min;
453 if (data->flags & SF_BEFORE_EOL)
b81d288d 454 data->flags
c277df42
IZ
455 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
456 else
457 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
458 }
459 else {
c277df42 460 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
461 data->offset_float_max = (l
462 ? data->last_start_max
c277df42 463 : data->pos_min + data->pos_delta);
9051bda5
HS
464 if ((U32)data->offset_float_max > (U32)I32_MAX)
465 data->offset_float_max = I32_MAX;
c277df42 466 if (data->flags & SF_BEFORE_EOL)
b81d288d 467 data->flags
c277df42
IZ
468 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
469 else
470 data->flags &= ~SF_FL_BEFORE_EOL;
471 }
472 }
473 SvCUR_set(data->last_found, 0);
0eda9292 474 {
a28509cc 475 SV * const sv = data->last_found;
097eb12c
AL
476 if (SvUTF8(sv) && SvMAGICAL(sv)) {
477 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
478 if (mg)
479 mg->mg_len = 0;
480 }
0eda9292 481 }
c277df42
IZ
482 data->last_end = -1;
483 data->flags &= ~SF_BEFORE_EOL;
484}
485
653099ff
GS
486/* Can match anything (initialization) */
487STATIC void
097eb12c 488S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 489{
653099ff 490 ANYOF_CLASS_ZERO(cl);
f8bef550 491 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 492 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
493 if (LOC)
494 cl->flags |= ANYOF_LOCALE;
495}
496
497/* Can match anything (initialization) */
498STATIC int
5f66b61c 499S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
500{
501 int value;
502
aaa51d5e 503 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
504 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
505 return 1;
1aa99e6b
IH
506 if (!(cl->flags & ANYOF_UNICODE_ALL))
507 return 0;
f8bef550
NC
508 if (!ANYOF_BITMAP_TESTALLSET(cl))
509 return 0;
653099ff
GS
510 return 1;
511}
512
513/* Can match anything (initialization) */
514STATIC void
097eb12c 515S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 516{
8ecf7187 517 Zero(cl, 1, struct regnode_charclass_class);
653099ff 518 cl->type = ANYOF;
830247a4 519 cl_anything(pRExC_state, cl);
653099ff
GS
520}
521
522STATIC void
097eb12c 523S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
8ecf7187 525 Zero(cl, 1, struct regnode_charclass_class);
653099ff 526 cl->type = ANYOF;
830247a4 527 cl_anything(pRExC_state, cl);
653099ff
GS
528 if (LOC)
529 cl->flags |= ANYOF_LOCALE;
530}
531
532/* 'And' a given class with another one. Can create false positives */
533/* We assume that cl is not inverted */
534STATIC void
5f66b61c 535S_cl_and(struct regnode_charclass_class *cl,
a28509cc 536 const struct regnode_charclass_class *and_with)
653099ff 537{
653099ff
GS
538 if (!(and_with->flags & ANYOF_CLASS)
539 && !(cl->flags & ANYOF_CLASS)
540 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
541 && !(and_with->flags & ANYOF_FOLD)
542 && !(cl->flags & ANYOF_FOLD)) {
543 int i;
544
545 if (and_with->flags & ANYOF_INVERT)
546 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
547 cl->bitmap[i] &= ~and_with->bitmap[i];
548 else
549 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
550 cl->bitmap[i] &= and_with->bitmap[i];
551 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
552 if (!(and_with->flags & ANYOF_EOS))
553 cl->flags &= ~ANYOF_EOS;
1aa99e6b 554
14ebb1a2
JH
555 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
556 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
557 cl->flags &= ~ANYOF_UNICODE_ALL;
558 cl->flags |= ANYOF_UNICODE;
559 ARG_SET(cl, ARG(and_with));
560 }
14ebb1a2
JH
561 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
562 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 563 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
564 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
565 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 566 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
567}
568
569/* 'OR' a given class with another one. Can create false positives */
570/* We assume that cl is not inverted */
571STATIC void
097eb12c 572S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 573{
653099ff
GS
574 if (or_with->flags & ANYOF_INVERT) {
575 /* We do not use
576 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
577 * <= (B1 | !B2) | (CL1 | !CL2)
578 * which is wasteful if CL2 is small, but we ignore CL2:
579 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
580 * XXXX Can we handle case-fold? Unclear:
581 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
582 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
583 */
584 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
585 && !(or_with->flags & ANYOF_FOLD)
586 && !(cl->flags & ANYOF_FOLD) ) {
587 int i;
588
589 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
590 cl->bitmap[i] |= ~or_with->bitmap[i];
591 } /* XXXX: logic is complicated otherwise */
592 else {
830247a4 593 cl_anything(pRExC_state, cl);
653099ff
GS
594 }
595 } else {
596 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
597 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 598 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
599 || (cl->flags & ANYOF_FOLD)) ) {
600 int i;
601
602 /* OR char bitmap and class bitmap separately */
603 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
604 cl->bitmap[i] |= or_with->bitmap[i];
605 if (or_with->flags & ANYOF_CLASS) {
606 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
607 cl->classflags[i] |= or_with->classflags[i];
608 cl->flags |= ANYOF_CLASS;
609 }
610 }
611 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 612 cl_anything(pRExC_state, cl);
653099ff
GS
613 }
614 }
615 if (or_with->flags & ANYOF_EOS)
616 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
617
618 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
619 ARG(cl) != ARG(or_with)) {
620 cl->flags |= ANYOF_UNICODE_ALL;
621 cl->flags &= ~ANYOF_UNICODE;
622 }
623 if (or_with->flags & ANYOF_UNICODE_ALL) {
624 cl->flags |= ANYOF_UNICODE_ALL;
625 cl->flags &= ~ANYOF_UNICODE;
626 }
653099ff
GS
627}
628
5d1c421c 629/*
a3621e74
YO
630
631 make_trie(startbranch,first,last,tail,flags)
632 startbranch: the first branch in the whole branch sequence
633 first : start branch of sequence of branch-exact nodes.
634 May be the same as startbranch
635 last : Thing following the last branch.
636 May be the same as tail.
637 tail : item following the branch sequence
638 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
639
640Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
641
642A trie is an N'ary tree where the branches are determined by digital
643decomposition of the key. IE, at the root node you look up the 1st character and
644follow that branch repeat until you find the end of the branches. Nodes can be
645marked as "accepting" meaning they represent a complete word. Eg:
646
647 /he|she|his|hers/
648
649would convert into the following structure. Numbers represent states, letters
650following numbers represent valid transitions on the letter from that state, if
651the number is in square brackets it represents an accepting state, otherwise it
652will be in parenthesis.
653
654 +-h->+-e->[3]-+-r->(8)-+-s->[9]
655 | |
656 | (2)
657 | |
658 (1) +-i->(6)-+-s->[7]
659 |
660 +-s->(3)-+-h->(4)-+-e->[5]
661
662 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
663
664This shows that when matching against the string 'hers' we will begin at state 1
665read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
666then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
667is also accepting. Thus we know that we can match both 'he' and 'hers' with a
668single traverse. We store a mapping from accepting to state to which word was
669matched, and then when we have multiple possibilities we try to complete the
670rest of the regex in the order in which they occured in the alternation.
671
672The only prior NFA like behaviour that would be changed by the TRIE support is
673the silent ignoring of duplicate alternations which are of the form:
674
675 / (DUPE|DUPE) X? (?{ ... }) Y /x
676
677Thus EVAL blocks follwing a trie may be called a different number of times with
678and without the optimisation. With the optimisations dupes will be silently
679ignored. This inconsistant behaviour of EVAL type nodes is well established as
680the following demonstrates:
681
682 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
683
684which prints out 'word' three times, but
685
686 'words'=~/(word|word|word)(?{ print $1 })S/
687
688which doesnt print it out at all. This is due to other optimisations kicking in.
689
690Example of what happens on a structural level:
691
692The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
693
694 1: CURLYM[1] {1,32767}(18)
695 5: BRANCH(8)
696 6: EXACT <ac>(16)
697 8: BRANCH(11)
698 9: EXACT <ad>(16)
699 11: BRANCH(14)
700 12: EXACT <ab>(16)
701 16: SUCCEED(0)
702 17: NOTHING(18)
703 18: END(0)
704
705This would be optimizable with startbranch=5, first=5, last=16, tail=16
706and should turn into:
707
708 1: CURLYM[1] {1,32767}(18)
709 5: TRIE(16)
710 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
711 <ac>
712 <ad>
713 <ab>
714 16: SUCCEED(0)
715 17: NOTHING(18)
716 18: END(0)
717
718Cases where tail != last would be like /(?foo|bar)baz/:
719
720 1: BRANCH(4)
721 2: EXACT <foo>(8)
722 4: BRANCH(7)
723 5: EXACT <bar>(8)
724 7: TAIL(8)
725 8: EXACT <baz>(10)
726 10: END(0)
727
728which would be optimizable with startbranch=1, first=1, last=7, tail=8
729and would end up looking like:
730
731 1: TRIE(8)
732 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
733 <foo>
734 <bar>
735 7: TAIL(8)
736 8: EXACT <baz>(10)
737 10: END(0)
738
739*/
740
741#define TRIE_DEBUG_CHAR \
742 DEBUG_TRIE_COMPILE_r({ \
743 SV *tmp; \
744 if ( UTF ) { \
6136c704 745 tmp = newSVpvs( "" ); \
a3621e74
YO
746 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
747 } else { \
e4584336 748 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
749 } \
750 av_push( trie->revcharmap, tmp ); \
751 })
752
753#define TRIE_READ_CHAR STMT_START { \
754 if ( UTF ) { \
755 if ( folder ) { \
756 if ( foldlen > 0 ) { \
757 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
758 foldlen -= len; \
759 scan += len; \
760 len = 0; \
761 } else { \
e1ec3a88 762 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
763 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
764 foldlen -= UNISKIP( uvc ); \
765 scan = foldbuf + UNISKIP( uvc ); \
766 } \
767 } else { \
e1ec3a88 768 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
769 } \
770 } else { \
771 uvc = (U32)*uc; \
772 len = 1; \
773 } \
774} STMT_END
775
776
777#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
778#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
779#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
780#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
781
782#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
783 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
784 TRIE_LIST_LEN( state ) *= 2; \
785 Renew( trie->states[ state ].trans.list, \
786 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
787 } \
788 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
789 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
790 TRIE_LIST_CUR( state )++; \
791} STMT_END
792
793#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 794 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
795 4, reg_trie_trans_le ); \
796 TRIE_LIST_CUR( state ) = 1; \
797 TRIE_LIST_LEN( state ) = 4; \
798} STMT_END
799
800STATIC I32
801S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
802{
27da23d5 803 dVAR;
a3621e74
YO
804 /* first pass, loop through and scan words */
805 reg_trie_data *trie;
806 regnode *cur;
9f7f3913 807 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
808 STRLEN len = 0;
809 UV uvc = 0;
810 U16 curword = 0;
811 U32 next_alloc = 0;
812 /* we just use folder as a flag in utf8 */
e1ec3a88 813 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
814 ? PL_fold
815 : ( flags == EXACTFL
816 ? PL_fold_locale
817 : NULL
818 )
819 );
820
e1ec3a88 821 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
822 SV *re_trie_maxbuff;
823
824 GET_RE_DEBUG_FLAGS_DECL;
825
a02a5408 826 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
827 trie->refcount = 1;
828 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 829 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
830 DEBUG_r({
831 trie->words = newAV();
832 trie->revcharmap = newAV();
833 });
834
835
0111c4fd 836 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 837 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 838 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
839 }
840
841 /* -- First loop and Setup --
842
843 We first traverse the branches and scan each word to determine if it
844 contains widechars, and how many unique chars there are, this is
845 important as we have to build a table with at least as many columns as we
846 have unique chars.
847
848 We use an array of integers to represent the character codes 0..255
849 (trie->charmap) and we use a an HV* to store unicode characters. We use the
850 native representation of the character value as the key and IV's for the
851 coded index.
852
853 *TODO* If we keep track of how many times each character is used we can
854 remap the columns so that the table compression later on is more
855 efficient in terms of memory by ensuring most common value is in the
856 middle and the least common are on the outside. IMO this would be better
857 than a most to least common mapping as theres a decent chance the most
858 common letter will share a node with the least common, meaning the node
859 will not be compressable. With a middle is most common approach the worst
860 case is when we have the least common nodes twice.
861
862 */
863
864
865 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 866 regnode * const noper = NEXTOPER( cur );
e1ec3a88 867 const U8 *uc = (U8*)STRING( noper );
a28509cc 868 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
869 STRLEN foldlen = 0;
870 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 871 const U8 *scan = (U8*)NULL;
a3621e74
YO
872
873 for ( ; uc < e ; uc += len ) {
874 trie->charcount++;
875 TRIE_READ_CHAR;
876 if ( uvc < 256 ) {
877 if ( !trie->charmap[ uvc ] ) {
878 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
879 if ( folder )
880 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
881 TRIE_DEBUG_CHAR;
882 }
883 } else {
884 SV** svpp;
885 if ( !trie->widecharmap )
886 trie->widecharmap = newHV();
887
888 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
889
890 if ( !svpp )
e4584336 891 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
892
893 if ( !SvTRUE( *svpp ) ) {
894 sv_setiv( *svpp, ++trie->uniquecharcount );
895 TRIE_DEBUG_CHAR;
896 }
897 }
898 }
899 trie->wordcount++;
900 } /* end first pass */
901 DEBUG_TRIE_COMPILE_r(
902 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
903 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 904 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
905 );
906
907
908 /*
909 We now know what we are dealing with in terms of unique chars and
910 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
911 representation using a flat table will take. If it's over a reasonable
912 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
913 conservative but potentially much slower representation using an array
914 of lists.
915
916 At the end we convert both representations into the same compressed
917 form that will be used in regexec.c for matching with. The latter
918 is a form that cannot be used to construct with but has memory
919 properties similar to the list form and access properties similar
920 to the table form making it both suitable for fast searches and
921 small enough that its feasable to store for the duration of a program.
922
923 See the comment in the code where the compressed table is produced
924 inplace from the flat tabe representation for an explanation of how
925 the compression works.
926
927 */
928
929
930 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
931 /*
932 Second Pass -- Array Of Lists Representation
933
934 Each state will be represented by a list of charid:state records
935 (reg_trie_trans_le) the first such element holds the CUR and LEN
936 points of the allocated array. (See defines above).
937
938 We build the initial structure using the lists, and then convert
939 it into the compressed table form which allows faster lookups
940 (but cant be modified once converted).
941
942
943 */
944
945
946 STRLEN transcount = 1;
947
a02a5408 948 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
949 TRIE_LIST_NEW(1);
950 next_alloc = 2;
951
952 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
953
c445ea15
AL
954 regnode * const noper = NEXTOPER( cur );
955 U8 *uc = (U8*)STRING( noper );
956 const U8 * const e = uc + STR_LEN( noper );
957 U32 state = 1; /* required init */
958 U16 charid = 0; /* sanity init */
959 U8 *scan = (U8*)NULL; /* sanity init */
960 STRLEN foldlen = 0; /* required init */
961 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
962
963 for ( ; uc < e ; uc += len ) {
964
965 TRIE_READ_CHAR;
966
967 if ( uvc < 256 ) {
968 charid = trie->charmap[ uvc ];
969 } else {
970 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
971 if ( !svpp ) {
972 charid = 0;
973 } else {
974 charid=(U16)SvIV( *svpp );
975 }
976 }
977 if ( charid ) {
a3621e74 978
c445ea15
AL
979 U16 check;
980 U32 newstate = 0;
a3621e74 981
c445ea15
AL
982 charid--;
983 if ( !trie->states[ state ].trans.list ) {
984 TRIE_LIST_NEW( state );
985 }
986 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
987 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
988 newstate = TRIE_LIST_ITEM( state, check ).newstate;
989 break;
990 }
991 }
992 if ( ! newstate ) {
993 newstate = next_alloc++;
994 TRIE_LIST_PUSH( state, charid, newstate );
995 transcount++;
996 }
997 state = newstate;
998 } else {
999 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1000 }
c445ea15
AL
1001 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1002 }
a3621e74 1003
c445ea15
AL
1004 if ( !trie->states[ state ].wordnum ) {
1005 /* we havent inserted this word into the structure yet. */
1006 trie->states[ state ].wordnum = ++curword;
a3621e74 1007
c445ea15
AL
1008 DEBUG_r({
1009 /* store the word for dumping */
1010 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1011 if ( UTF ) SvUTF8_on( tmp );
1012 av_push( trie->words, tmp );
1013 });
a3621e74 1014
c445ea15 1015 } else {
bb263b4e 1016 /*EMPTY*/; /* It's a dupe. So ignore it. */
c445ea15 1017 }
a3621e74
YO
1018
1019 } /* end second pass */
1020
1021 trie->laststate = next_alloc;
1022 Renew( trie->states, next_alloc, reg_trie_state );
1023
1024 DEBUG_TRIE_COMPILE_MORE_r({
1025 U32 state;
a3621e74 1026
a28509cc 1027 /* print out the table precompression. */
a3621e74
YO
1028
1029 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1030 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1031
1032 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1033 U16 charid;
a3621e74 1034
e4584336 1035 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1036 if ( ! trie->states[ state ].wordnum ) {
1037 PerlIO_printf( Perl_debug_log, "%5s| ","");
1038 } else {
e4584336 1039 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1040 trie->states[ state ].wordnum
1041 );
1042 }
1043 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1044 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1045 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1046 SvPV_nolen_const( *tmp ),
a3621e74 1047 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1048 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1049 );
1050 }
1051
1052 }
1053 PerlIO_printf( Perl_debug_log, "\n\n" );
1054 });
1055
a02a5408 1056 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1057 {
1058 U32 state;
a3621e74
YO
1059 U32 tp = 0;
1060 U32 zp = 0;
1061
1062
1063 for( state=1 ; state < next_alloc ; state ++ ) {
1064 U32 base=0;
1065
1066 /*
1067 DEBUG_TRIE_COMPILE_MORE_r(
1068 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1069 );
1070 */
1071
1072 if (trie->states[state].trans.list) {
1073 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1074 U16 maxid=minid;
a28509cc 1075 U16 idx;
a3621e74
YO
1076
1077 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1078 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1079 if ( forid < minid ) {
1080 minid=forid;
1081 } else if ( forid > maxid ) {
1082 maxid=forid;
1083 }
a3621e74
YO
1084 }
1085 if ( transcount < tp + maxid - minid + 1) {
1086 transcount *= 2;
1087 Renew( trie->trans, transcount, reg_trie_trans );
1088 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1089 }
1090 base = trie->uniquecharcount + tp - minid;
1091 if ( maxid == minid ) {
1092 U32 set = 0;
1093 for ( ; zp < tp ; zp++ ) {
1094 if ( ! trie->trans[ zp ].next ) {
1095 base = trie->uniquecharcount + zp - minid;
1096 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1097 trie->trans[ zp ].check = state;
1098 set = 1;
1099 break;
1100 }
1101 }
1102 if ( !set ) {
1103 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1104 trie->trans[ tp ].check = state;
1105 tp++;
1106 zp = tp;
1107 }
1108 } else {
1109 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1110 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1111 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1112 trie->trans[ tid ].check = state;
1113 }
1114 tp += ( maxid - minid + 1 );
1115 }
1116 Safefree(trie->states[ state ].trans.list);
1117 }
1118 /*
1119 DEBUG_TRIE_COMPILE_MORE_r(
1120 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1121 );
1122 */
1123 trie->states[ state ].trans.base=base;
1124 }
cc601c31 1125 trie->lasttrans = tp + 1;
a3621e74
YO
1126 }
1127 } else {
1128 /*
1129 Second Pass -- Flat Table Representation.
1130
1131 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1132 We know that we will need Charcount+1 trans at most to store the data
1133 (one row per char at worst case) So we preallocate both structures
1134 assuming worst case.
1135
1136 We then construct the trie using only the .next slots of the entry
1137 structs.
1138
1139 We use the .check field of the first entry of the node temporarily to
1140 make compression both faster and easier by keeping track of how many non
1141 zero fields are in the node.
1142
1143 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1144 transition.
1145
1146 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1147 number representing the first entry of the node, and state as a
1148 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1149 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1150 are 2 entrys per node. eg:
1151
1152 A B A B
1153 1. 2 4 1. 3 7
1154 2. 0 3 3. 0 5
1155 3. 0 0 5. 0 0
1156 4. 0 0 7. 0 0
1157
1158 The table is internally in the right hand, idx form. However as we also
1159 have to deal with the states array which is indexed by nodenum we have to
1160 use TRIE_NODENUM() to convert.
1161
1162 */
1163
a02a5408 1164 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1165 reg_trie_trans );
a02a5408 1166 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1167 next_alloc = trie->uniquecharcount + 1;
1168
1169 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1170
c445ea15 1171 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1172 const U8 *uc = (U8*)STRING( noper );
1173 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1174
1175 U32 state = 1; /* required init */
1176
1177 U16 charid = 0; /* sanity init */
1178 U32 accept_state = 0; /* sanity init */
1179 U8 *scan = (U8*)NULL; /* sanity init */
1180
1181 STRLEN foldlen = 0; /* required init */
1182 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1183
1184
1185 for ( ; uc < e ; uc += len ) {
1186
1187 TRIE_READ_CHAR;
1188
1189 if ( uvc < 256 ) {
1190 charid = trie->charmap[ uvc ];
1191 } else {
c445ea15
AL
1192 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1193 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1194 }
1195 if ( charid ) {
1196 charid--;
1197 if ( !trie->trans[ state + charid ].next ) {
1198 trie->trans[ state + charid ].next = next_alloc;
1199 trie->trans[ state ].check++;
1200 next_alloc += trie->uniquecharcount;
1201 }
1202 state = trie->trans[ state + charid ].next;
1203 } else {
e4584336 1204 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1205 }
1206 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1207 }
1208
1209 accept_state = TRIE_NODENUM( state );
1210 if ( !trie->states[ accept_state ].wordnum ) {
1211 /* we havent inserted this word into the structure yet. */
1212 trie->states[ accept_state ].wordnum = ++curword;
1213
1214 DEBUG_r({
1215 /* store the word for dumping */
1216 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1217 if ( UTF ) SvUTF8_on( tmp );
1218 av_push( trie->words, tmp );
1219 });
1220
1221 } else {
bb263b4e 1222 /*EMPTY*/; /* Its a dupe. So ignore it. */
a3621e74
YO
1223 }
1224
1225 } /* end second pass */
1226
1227 DEBUG_TRIE_COMPILE_MORE_r({
1228 /*
1229 print out the table precompression so that we can do a visual check
1230 that they are identical.
1231 */
1232 U32 state;
1233 U16 charid;
1234 PerlIO_printf( Perl_debug_log, "\nChar : " );
1235
1236 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1237 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1238 if ( tmp ) {
cfd0369c 1239 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1240 }
1241 }
1242
1243 PerlIO_printf( Perl_debug_log, "\nState+-" );
1244
1245 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1246 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1247 }
1248
1249 PerlIO_printf( Perl_debug_log, "\n" );
1250
1251 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1252
e4584336 1253 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1254
1255 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1256 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1257 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1258 }
1259 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1260 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1261 } else {
e4584336 1262 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1263 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1264 }
1265 }
1266 PerlIO_printf( Perl_debug_log, "\n\n" );
1267 });
1268 {
1269 /*
1270 * Inplace compress the table.*
1271
1272 For sparse data sets the table constructed by the trie algorithm will
1273 be mostly 0/FAIL transitions or to put it another way mostly empty.
1274 (Note that leaf nodes will not contain any transitions.)
1275
1276 This algorithm compresses the tables by eliminating most such
1277 transitions, at the cost of a modest bit of extra work during lookup:
1278
1279 - Each states[] entry contains a .base field which indicates the
1280 index in the state[] array wheres its transition data is stored.
1281
1282 - If .base is 0 there are no valid transitions from that node.
1283
1284 - If .base is nonzero then charid is added to it to find an entry in
1285 the trans array.
1286
1287 -If trans[states[state].base+charid].check!=state then the
1288 transition is taken to be a 0/Fail transition. Thus if there are fail
1289 transitions at the front of the node then the .base offset will point
1290 somewhere inside the previous nodes data (or maybe even into a node
1291 even earlier), but the .check field determines if the transition is
1292 valid.
1293
1294 The following process inplace converts the table to the compressed
1295 table: We first do not compress the root node 1,and mark its all its
1296 .check pointers as 1 and set its .base pointer as 1 as well. This
1297 allows to do a DFA construction from the compressed table later, and
1298 ensures that any .base pointers we calculate later are greater than
1299 0.
1300
1301 - We set 'pos' to indicate the first entry of the second node.
1302
1303 - We then iterate over the columns of the node, finding the first and
1304 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1305 and set the .check pointers accordingly, and advance pos
1306 appropriately and repreat for the next node. Note that when we copy
1307 the next pointers we have to convert them from the original
1308 NODEIDX form to NODENUM form as the former is not valid post
1309 compression.
1310
1311 - If a node has no transitions used we mark its base as 0 and do not
1312 advance the pos pointer.
1313
1314 - If a node only has one transition we use a second pointer into the
1315 structure to fill in allocated fail transitions from other states.
1316 This pointer is independent of the main pointer and scans forward
1317 looking for null transitions that are allocated to a state. When it
1318 finds one it writes the single transition into the "hole". If the
1319 pointer doesnt find one the single transition is appeneded as normal.
1320
1321 - Once compressed we can Renew/realloc the structures to release the
1322 excess space.
1323
1324 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1325 specifically Fig 3.47 and the associated pseudocode.
1326
1327 demq
1328 */
a3b680e6 1329 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1330 U32 state, charid;
a3621e74
YO
1331 U32 pos = 0, zp=0;
1332 trie->laststate = laststate;
1333
1334 for ( state = 1 ; state < laststate ; state++ ) {
1335 U8 flag = 0;
a28509cc
AL
1336 const U32 stateidx = TRIE_NODEIDX( state );
1337 const U32 o_used = trie->trans[ stateidx ].check;
1338 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1339 trie->trans[ stateidx ].check = 0;
1340
1341 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1342 if ( flag || trie->trans[ stateidx + charid ].next ) {
1343 if ( trie->trans[ stateidx + charid ].next ) {
1344 if (o_used == 1) {
1345 for ( ; zp < pos ; zp++ ) {
1346 if ( ! trie->trans[ zp ].next ) {
1347 break;
1348 }
1349 }
1350 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1351 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1352 trie->trans[ zp ].check = state;
1353 if ( ++zp > pos ) pos = zp;
1354 break;
1355 }
1356 used--;
1357 }
1358 if ( !flag ) {
1359 flag = 1;
1360 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1361 }
1362 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1363 trie->trans[ pos ].check = state;
1364 pos++;
1365 }
1366 }
1367 }
cc601c31 1368 trie->lasttrans = pos + 1;
a3621e74
YO
1369 Renew( trie->states, laststate + 1, reg_trie_state);
1370 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1371 PerlIO_printf( Perl_debug_log,
1372 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1373 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1374 (IV)next_alloc,
1375 (IV)pos,
a3621e74
YO
1376 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1377 );
1378
1379 } /* end table compress */
1380 }
cc601c31
YO
1381 /* resize the trans array to remove unused space */
1382 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1383
1384 DEBUG_TRIE_COMPILE_r({
1385 U32 state;
1386 /*
1387 Now we print it out again, in a slightly different form as there is additional
1388 info we want to be able to see when its compressed. They are close enough for
1389 visual comparison though.
1390 */
1391 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1392
1393 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1394 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1395 if ( tmp ) {
cfd0369c 1396 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1397 }
1398 }
1399 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1400
a3621e74
YO
1401 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1402 PerlIO_printf( Perl_debug_log, "-----");
1403 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1404
a3621e74 1405 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1406 const U32 base = trie->states[ state ].trans.base;
a3621e74 1407
e4584336 1408 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1409
1410 if ( trie->states[ state ].wordnum ) {
1411 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1412 } else {
1413 PerlIO_printf( Perl_debug_log, "%6s", "" );
1414 }
1415
e4584336 1416 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1417
1418 if ( base ) {
1419 U32 ofs = 0;
1420
cc601c31
YO
1421 while( ( base + ofs < trie->uniquecharcount ) ||
1422 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1423 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1424 ofs++;
1425
e4584336 1426 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1427
1428 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1429 if ( ( base + ofs >= trie->uniquecharcount ) &&
1430 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1431 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1432 {
e4584336
RB
1433 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1434 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1435 } else {
1436 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1437 }
1438 }
1439
e4584336 1440 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1441
1442 }
1443 PerlIO_printf( Perl_debug_log, "\n" );
1444 }
1445 });
1446
1447 {
1448 /* now finally we "stitch in" the new TRIE node
1449 This means we convert either the first branch or the first Exact,
1450 depending on whether the thing following (in 'last') is a branch
1451 or not and whther first is the startbranch (ie is it a sub part of
1452 the alternation or is it the whole thing.)
1453 Assuming its a sub part we conver the EXACT otherwise we convert
1454 the whole branch sequence, including the first.
1455 */
1456 regnode *convert;
1457
1458
1459
1460
1461 if ( first == startbranch && OP( last ) != BRANCH ) {
1462 convert = first;
1463 } else {
1464 convert = NEXTOPER( first );
1465 NEXT_OFF( first ) = (U16)(last - first);
1466 }
1467
1468 OP( convert ) = TRIE + (U8)( flags - EXACT );
1469 NEXT_OFF( convert ) = (U16)(tail - convert);
1470 ARG_SET( convert, data_slot );
1471
1472 /* tells us if we need to handle accept buffers specially */
1473 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1474
1475
1476 /* needed for dumping*/
1477 DEBUG_r({
1478 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1479 /* We now need to mark all of the space originally used by the
1480 branches as optimized away. This keeps the dumpuntil from
1481 throwing a wobbly as it doesnt use regnext() to traverse the
1482 opcodes.
1483 */
1484 while( optimize < last ) {
1485 OP( optimize ) = OPTIMIZED;
1486 optimize++;
1487 }
1488 });
1489 } /* end node insert */
1490 return 1;
1491}
1492
1493
1494
1495/*
5d1c421c
JH
1496 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1497 * These need to be revisited when a newer toolchain becomes available.
1498 */
1499#if defined(__sparc64__) && defined(__GNUC__)
1500# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1501# undef SPARC64_GCC_WORKAROUND
1502# define SPARC64_GCC_WORKAROUND 1
1503# endif
1504#endif
1505
653099ff
GS
1506/* REx optimizer. Converts nodes into quickier variants "in place".
1507 Finds fixed substrings. */
1508
a0288114 1509/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1510 to the position after last scanned or to NULL. */
1511
a3621e74 1512
76e3520e 1513STATIC I32
9a957fbc
AL
1514S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1515 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1516 /* scanp: Start here (read-write). */
1517 /* deltap: Write maxlen-minlen here. */
1518 /* last: Stop before this one. */
1519{
97aff369 1520 dVAR;
c277df42
IZ
1521 I32 min = 0, pars = 0, code;
1522 regnode *scan = *scanp, *next;
1523 I32 delta = 0;
1524 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1525 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1526 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1527 scan_data_t data_fake;
653099ff 1528 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1529 SV *re_trie_maxbuff = NULL;
1530
1531 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1532
c277df42
IZ
1533 while (scan && OP(scan) != END && scan < last) {
1534 /* Peephole optimizer: */
a3621e74 1535 DEBUG_OPTIMISE_r({
c445ea15 1536 SV * const mysv=sv_newmortal();
32fc9b6a 1537 regprop(RExC_rx, mysv, scan);
e4584336 1538 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1539 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1540 });
c277df42 1541
22c35a8c 1542 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1543 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1544 regnode *n = regnext(scan);
1545 U32 stringok = 1;
1546#ifdef DEBUGGING
1547 regnode *stop = scan;
b81d288d 1548#endif
c277df42 1549
cd439c50 1550 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1551 /* Skip NOTHING, merge EXACT*. */
1552 while (n &&
b81d288d 1553 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1554 (stringok && (OP(n) == OP(scan))))
1555 && NEXT_OFF(n)
1556 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1557 if (OP(n) == TAIL || n > next)
1558 stringok = 0;
22c35a8c 1559 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1560 NEXT_OFF(scan) += NEXT_OFF(n);
1561 next = n + NODE_STEP_REGNODE;
1562#ifdef DEBUGGING
1563 if (stringok)
1564 stop = n;
b81d288d 1565#endif
c277df42 1566 n = regnext(n);
a0ed51b3 1567 }
f49d4d0f 1568 else if (stringok) {
a3b680e6 1569 const int oldl = STR_LEN(scan);
c445ea15 1570 regnode * const nnext = regnext(n);
f49d4d0f 1571
b81d288d 1572 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1573 break;
1574 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1575 STR_LEN(scan) += STR_LEN(n);
1576 next = n + NODE_SZ_STR(n);
c277df42 1577 /* Now we can overwrite *n : */
f49d4d0f 1578 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1579#ifdef DEBUGGING
f49d4d0f 1580 stop = next - 1;
b81d288d 1581#endif
c277df42
IZ
1582 n = nnext;
1583 }
1584 }
61a36c01 1585
a3621e74 1586 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1587/*
1588 Two problematic code points in Unicode casefolding of EXACT nodes:
1589
1590 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1591 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1592
1593 which casefold to
1594
1595 Unicode UTF-8
1596
1597 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1598 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1599
1600 This means that in case-insensitive matching (or "loose matching",
1601 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1602 length of the above casefolded versions) can match a target string
1603 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1604 This would rather mess up the minimum length computation.
1605
1606 What we'll do is to look for the tail four bytes, and then peek
1607 at the preceding two bytes to see whether we need to decrease
1608 the minimum length by four (six minus two).
1609
1610 Thanks to the design of UTF-8, there cannot be false matches:
1611 A sequence of valid UTF-8 bytes cannot be a subsequence of
1612 another valid sequence of UTF-8 bytes.
1613
1614*/
c445ea15
AL
1615 char * const s0 = STRING(scan), *s, *t;
1616 char * const s1 = s0 + STR_LEN(scan) - 1;
1617 char * const s2 = s1 - 4;
d4c19fe8 1618 const char t0[] = "\xcc\x88\xcc\x81";
a28509cc 1619 const char * const t1 = t0 + 3;
2af232bd 1620
61a36c01
JH
1621 for (s = s0 + 2;
1622 s < s2 && (t = ninstr(s, s1, t0, t1));
1623 s = t + 4) {
1624 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1625 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1626 min -= 4;
1627 }
1628 }
1629
c277df42
IZ
1630#ifdef DEBUGGING
1631 /* Allow dumping */
cd439c50 1632 n = scan + NODE_SZ_STR(scan);
c277df42 1633 while (n <= stop) {
22c35a8c 1634 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1635 OP(n) = OPTIMIZED;
1636 NEXT_OFF(n) = 0;
1637 }
1638 n++;
1639 }
653099ff 1640#endif
c277df42 1641 }
a3621e74
YO
1642
1643
1644
653099ff
GS
1645 /* Follow the next-chain of the current node and optimize
1646 away all the NOTHINGs from it. */
c277df42 1647 if (OP(scan) != CURLYX) {
a3b680e6 1648 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1649 ? I32_MAX
1650 /* I32 may be smaller than U16 on CRAYs! */
1651 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1652 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1653 int noff;
1654 regnode *n = scan;
b81d288d 1655
c277df42
IZ
1656 /* Skip NOTHING and LONGJMP. */
1657 while ((n = regnext(n))
22c35a8c 1658 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1659 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1660 && off + noff < max)
1661 off += noff;
1662 if (reg_off_by_arg[OP(scan)])
1663 ARG(scan) = off;
b81d288d 1664 else
c277df42
IZ
1665 NEXT_OFF(scan) = off;
1666 }
a3621e74 1667
653099ff
GS
1668 /* The principal pseudo-switch. Cannot be a switch, since we
1669 look into several different things. */
b81d288d 1670 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1671 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1672 next = regnext(scan);
1673 code = OP(scan);
a3621e74 1674 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1675
1676 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1677 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1678 struct regnode_charclass_class accum;
d4c19fe8 1679 regnode * const startbranch=scan;
c277df42 1680
653099ff 1681 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1682 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1683 if (flags & SCF_DO_STCLASS)
830247a4 1684 cl_init_zero(pRExC_state, &accum);
a3621e74 1685
c277df42 1686 while (OP(scan) == code) {
830247a4 1687 I32 deltanext, minnext, f = 0, fake;
653099ff 1688 struct regnode_charclass_class this_class;
c277df42
IZ
1689
1690 num++;
1691 data_fake.flags = 0;
b81d288d 1692 if (data) {
2c2d71f5 1693 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1694 data_fake.last_closep = data->last_closep;
1695 }
1696 else
1697 data_fake.last_closep = &fake;
c277df42
IZ
1698 next = regnext(scan);
1699 scan = NEXTOPER(scan);
1700 if (code != BRANCH)
1701 scan = NEXTOPER(scan);
653099ff 1702 if (flags & SCF_DO_STCLASS) {
830247a4 1703 cl_init(pRExC_state, &this_class);
653099ff
GS
1704 data_fake.start_class = &this_class;
1705 f = SCF_DO_STCLASS_AND;
b81d288d 1706 }
e1901655
IZ
1707 if (flags & SCF_WHILEM_VISITED_POS)
1708 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1709
653099ff 1710 /* we suppose the run is continuous, last=next...*/
830247a4 1711 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1712 next, &data_fake, f,depth+1);
b81d288d 1713 if (min1 > minnext)
c277df42
IZ
1714 min1 = minnext;
1715 if (max1 < minnext + deltanext)
1716 max1 = minnext + deltanext;
1717 if (deltanext == I32_MAX)
aca2d497 1718 is_inf = is_inf_internal = 1;
c277df42
IZ
1719 scan = next;
1720 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1721 pars++;
405ff068 1722 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1723 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1724 if (data)
1725 data->whilem_c = data_fake.whilem_c;
653099ff 1726 if (flags & SCF_DO_STCLASS)
830247a4 1727 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1728 if (code == SUSPEND)
c277df42
IZ
1729 break;
1730 }
1731 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1732 min1 = 0;
1733 if (flags & SCF_DO_SUBSTR) {
1734 data->pos_min += min1;
1735 data->pos_delta += max1 - min1;
1736 if (max1 != min1 || is_inf)
1737 data->longest = &(data->longest_float);
1738 }
1739 min += min1;
1740 delta += max1 - min1;
653099ff 1741 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1742 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1743 if (min1) {
1744 cl_and(data->start_class, &and_with);
1745 flags &= ~SCF_DO_STCLASS;
1746 }
1747 }
1748 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1749 if (min1) {
1750 cl_and(data->start_class, &accum);
653099ff 1751 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1752 }
1753 else {
b81d288d 1754 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1755 * data->start_class */
1756 StructCopy(data->start_class, &and_with,
1757 struct regnode_charclass_class);
1758 flags &= ~SCF_DO_STCLASS_AND;
1759 StructCopy(&accum, data->start_class,
1760 struct regnode_charclass_class);
1761 flags |= SCF_DO_STCLASS_OR;
1762 data->start_class->flags |= ANYOF_EOS;
1763 }
653099ff 1764 }
a3621e74
YO
1765
1766 /* demq.
1767
1768 Assuming this was/is a branch we are dealing with: 'scan' now
1769 points at the item that follows the branch sequence, whatever
1770 it is. We now start at the beginning of the sequence and look
1771 for subsequences of
1772
1773 BRANCH->EXACT=>X
1774 BRANCH->EXACT=>X
1775
1776 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1777
1778 If we can find such a subseqence we need to turn the first
1779 element into a trie and then add the subsequent branch exact
1780 strings to the trie.
1781
1782 We have two cases
1783
1784 1. patterns where the whole set of branch can be converted to a trie,
1785
1786 2. patterns where only a subset of the alternations can be
1787 converted to a trie.
1788
1789 In case 1 we can replace the whole set with a single regop
1790 for the trie. In case 2 we need to keep the start and end
1791 branchs so
1792
1793 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1794 becomes BRANCH TRIE; BRANCH X;
1795
1796 Hypthetically when we know the regex isnt anchored we can
1797 turn a case 1 into a DFA and let it rip... Every time it finds a match
1798 it would just call its tail, no WHILEM/CURLY needed.
1799
1800 */
0111c4fd
RGS
1801 if (DO_TRIE) {
1802 if (!re_trie_maxbuff) {
1803 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1804 if (!SvIOK(re_trie_maxbuff))
1805 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1806 }
a3621e74
YO
1807 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1808 regnode *cur;
1809 regnode *first = (regnode *)NULL;
1810 regnode *last = (regnode *)NULL;
1811 regnode *tail = scan;
1812 U8 optype = 0;
1813 U32 count=0;
1814
1815#ifdef DEBUGGING
c445ea15 1816 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
1817#endif
1818 /* var tail is used because there may be a TAIL
1819 regop in the way. Ie, the exacts will point to the
1820 thing following the TAIL, but the last branch will
1821 point at the TAIL. So we advance tail. If we
1822 have nested (?:) we may have to move through several
1823 tails.
1824 */
1825
1826 while ( OP( tail ) == TAIL ) {
1827 /* this is the TAIL generated by (?:) */
1828 tail = regnext( tail );
1829 }
1830
1831 DEBUG_OPTIMISE_r({
32fc9b6a 1832 regprop(RExC_rx, mysv, tail );
a3621e74 1833 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1834 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1835 (RExC_seen_evals) ? "[EVAL]" : ""
1836 );
1837 });
1838 /*
1839
1840 step through the branches, cur represents each
1841 branch, noper is the first thing to be matched
1842 as part of that branch and noper_next is the
1843 regnext() of that node. if noper is an EXACT
1844 and noper_next is the same as scan (our current
1845 position in the regex) then the EXACT branch is
1846 a possible optimization target. Once we have
1847 two or more consequetive such branches we can
1848 create a trie of the EXACT's contents and stich
1849 it in place. If the sequence represents all of
1850 the branches we eliminate the whole thing and
1851 replace it with a single TRIE. If it is a
1852 subsequence then we need to stitch it in. This
1853 means the first branch has to remain, and needs
1854 to be repointed at the item on the branch chain
1855 following the last branch optimized. This could
1856 be either a BRANCH, in which case the
1857 subsequence is internal, or it could be the
1858 item following the branch sequence in which
1859 case the subsequence is at the end.
1860
1861 */
1862
1863 /* dont use tail as the end marker for this traverse */
1864 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1865 regnode * const noper = NEXTOPER( cur );
1866 regnode * const noper_next = regnext( noper );
a3621e74 1867
a3621e74 1868 DEBUG_OPTIMISE_r({
32fc9b6a 1869 regprop(RExC_rx, mysv, cur);
a3621e74 1870 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1871 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74 1872
32fc9b6a 1873 regprop(RExC_rx, mysv, noper);
a3621e74 1874 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1875 SvPV_nolen_const(mysv));
a3621e74
YO
1876
1877 if ( noper_next ) {
32fc9b6a 1878 regprop(RExC_rx, mysv, noper_next );
a3621e74 1879 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1880 SvPV_nolen_const(mysv));
a3621e74
YO
1881 }
1882 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1883 first, last, cur );
1884 });
1885 if ( ( first ? OP( noper ) == optype
1886 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1887 && noper_next == tail && count<U16_MAX)
1888 {
1889 count++;
1890 if ( !first ) {
1891 first = cur;
1892 optype = OP( noper );
1893 } else {
1894 DEBUG_OPTIMISE_r(
1895 if (!last ) {
32fc9b6a 1896 regprop(RExC_rx, mysv, first);
a3621e74 1897 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1898 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
32fc9b6a 1899 regprop(RExC_rx, mysv, NEXTOPER(first) );
a3621e74 1900 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1901 SvPV_nolen_const( mysv ) );
a3621e74
YO
1902 }
1903 );
1904 last = cur;
1905 DEBUG_OPTIMISE_r({
32fc9b6a 1906 regprop(RExC_rx, mysv, cur);
a3621e74 1907 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1908 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
32fc9b6a 1909 regprop(RExC_rx, mysv, noper );
a3621e74 1910 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1911 SvPV_nolen_const( mysv ) );
a3621e74
YO
1912 });
1913 }
1914 } else {
1915 if ( last ) {
1916 DEBUG_OPTIMISE_r(
1917 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1918 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1919 );
1920 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1921 }
1922 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1923 && noper_next == tail )
1924 {
1925 count = 1;
1926 first = cur;
1927 optype = OP( noper );
1928 } else {
1929 count = 0;
1930 first = NULL;
1931 optype = 0;
1932 }
1933 last = NULL;
1934 }
1935 }
1936 DEBUG_OPTIMISE_r({
32fc9b6a 1937 regprop(RExC_rx, mysv, cur);
a3621e74 1938 PerlIO_printf( Perl_debug_log,
e4584336 1939 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
cfd0369c 1940 " ", SvPV_nolen_const( mysv ), first, last, cur);
a3621e74
YO
1941
1942 });
1943 if ( last ) {
1944 DEBUG_OPTIMISE_r(
1945 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1946 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1947 );
1948 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1949 }
1950 }
1951 }
a0ed51b3 1952 }
a3621e74 1953 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1954 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1955 } else /* single branch is optimized. */
c277df42
IZ
1956 scan = NEXTOPER(scan);
1957 continue;
a0ed51b3
LW
1958 }
1959 else if (OP(scan) == EXACT) {
cd439c50 1960 I32 l = STR_LEN(scan);
c445ea15 1961 UV uc;
a0ed51b3 1962 if (UTF) {
a3b680e6 1963 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1964 l = utf8_length(s, s + l);
9041c2e3 1965 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
1966 } else {
1967 uc = *((U8*)STRING(scan));
a0ed51b3
LW
1968 }
1969 min += l;
c277df42 1970 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1971 /* The code below prefers earlier match for fixed
1972 offset, later match for variable offset. */
1973 if (data->last_end == -1) { /* Update the start info. */
1974 data->last_start_min = data->pos_min;
1975 data->last_start_max = is_inf
b81d288d 1976 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 1977 }
cd439c50 1978 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
1979 if (UTF)
1980 SvUTF8_on(data->last_found);
0eda9292 1981 {
9a957fbc 1982 SV * const sv = data->last_found;
a28509cc 1983 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
1984 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1985 if (mg && mg->mg_len >= 0)
5e43f467
JH
1986 mg->mg_len += utf8_length((U8*)STRING(scan),
1987 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 1988 }
c277df42
IZ
1989 data->last_end = data->pos_min + l;
1990 data->pos_min += l; /* As in the first entry. */
1991 data->flags &= ~SF_BEFORE_EOL;
1992 }
653099ff
GS
1993 if (flags & SCF_DO_STCLASS_AND) {
1994 /* Check whether it is compatible with what we know already! */
1995 int compat = 1;
1996
1aa99e6b 1997 if (uc >= 0x100 ||
516a5887 1998 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 1999 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2000 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2001 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2002 )
653099ff
GS
2003 compat = 0;
2004 ANYOF_CLASS_ZERO(data->start_class);
2005 ANYOF_BITMAP_ZERO(data->start_class);
2006 if (compat)
1aa99e6b 2007 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2008 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2009 if (uc < 0x100)
2010 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2011 }
2012 else if (flags & SCF_DO_STCLASS_OR) {
2013 /* false positive possible if the class is case-folded */
1aa99e6b 2014 if (uc < 0x100)
9b877dbb
IH
2015 ANYOF_BITMAP_SET(data->start_class, uc);
2016 else
2017 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2018 data->start_class->flags &= ~ANYOF_EOS;
2019 cl_and(data->start_class, &and_with);
2020 }
2021 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2022 }
653099ff 2023 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2024 I32 l = STR_LEN(scan);
1aa99e6b 2025 UV uc = *((U8*)STRING(scan));
653099ff
GS
2026
2027 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2028 if (flags & SCF_DO_SUBSTR) {
2029 assert(data);
830247a4 2030 scan_commit(pRExC_state, data);
ecaa9b9c 2031 }
a0ed51b3 2032 if (UTF) {
6136c704 2033 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2034 l = utf8_length(s, s + l);
9041c2e3 2035 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2036 }
2037 min += l;
ecaa9b9c 2038 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2039 data->pos_min += l;
653099ff
GS
2040 if (flags & SCF_DO_STCLASS_AND) {
2041 /* Check whether it is compatible with what we know already! */
2042 int compat = 1;
2043
1aa99e6b 2044 if (uc >= 0x100 ||
516a5887 2045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2048 compat = 0;
2049 ANYOF_CLASS_ZERO(data->start_class);
2050 ANYOF_BITMAP_ZERO(data->start_class);
2051 if (compat) {
1aa99e6b 2052 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2053 data->start_class->flags &= ~ANYOF_EOS;
2054 data->start_class->flags |= ANYOF_FOLD;
2055 if (OP(scan) == EXACTFL)
2056 data->start_class->flags |= ANYOF_LOCALE;
2057 }
2058 }
2059 else if (flags & SCF_DO_STCLASS_OR) {
2060 if (data->start_class->flags & ANYOF_FOLD) {
2061 /* false positive possible if the class is case-folded.
2062 Assume that the locale settings are the same... */
1aa99e6b
IH
2063 if (uc < 0x100)
2064 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2065 data->start_class->flags &= ~ANYOF_EOS;
2066 }
2067 cl_and(data->start_class, &and_with);
2068 }
2069 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2070 }
bfed75c6 2071 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2073 I32 f = flags, pos_before = 0;
d4c19fe8 2074 regnode * const oscan = scan;
653099ff
GS
2075 struct regnode_charclass_class this_class;
2076 struct regnode_charclass_class *oclass = NULL;
727f22e3 2077 I32 next_is_eval = 0;
653099ff 2078
22c35a8c 2079 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2080 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2081 scan = NEXTOPER(scan);
2082 goto finish;
2083 case PLUS:
653099ff 2084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2085 next = NEXTOPER(scan);
653099ff 2086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2087 mincount = 1;
2088 maxcount = REG_INFTY;
c277df42
IZ
2089 next = regnext(scan);
2090 scan = NEXTOPER(scan);
2091 goto do_curly;
2092 }
2093 }
2094 if (flags & SCF_DO_SUBSTR)
2095 data->pos_min++;
2096 min++;
2097 /* Fall through. */
2098 case STAR:
653099ff
GS
2099 if (flags & SCF_DO_STCLASS) {
2100 mincount = 0;
b81d288d 2101 maxcount = REG_INFTY;
653099ff
GS
2102 next = regnext(scan);
2103 scan = NEXTOPER(scan);
2104 goto do_curly;
2105 }
b81d288d 2106 is_inf = is_inf_internal = 1;
c277df42
IZ
2107 scan = regnext(scan);
2108 if (flags & SCF_DO_SUBSTR) {
830247a4 2109 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2110 data->longest = &(data->longest_float);
2111 }
2112 goto optimize_curly_tail;
2113 case CURLY:
b81d288d 2114 mincount = ARG1(scan);
c277df42
IZ
2115 maxcount = ARG2(scan);
2116 next = regnext(scan);
cb434fcc
IZ
2117 if (OP(scan) == CURLYX) {
2118 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2119 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2120 }
c277df42 2121 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2122 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2123 do_curly:
2124 if (flags & SCF_DO_SUBSTR) {
830247a4 2125 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2126 pos_before = data->pos_min;
2127 }
2128 if (data) {
2129 fl = data->flags;
2130 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2131 if (is_inf)
2132 data->flags |= SF_IS_INF;
2133 }
653099ff 2134 if (flags & SCF_DO_STCLASS) {
830247a4 2135 cl_init(pRExC_state, &this_class);
653099ff
GS
2136 oclass = data->start_class;
2137 data->start_class = &this_class;
2138 f |= SCF_DO_STCLASS_AND;
2139 f &= ~SCF_DO_STCLASS_OR;
2140 }
e1901655
IZ
2141 /* These are the cases when once a subexpression
2142 fails at a particular position, it cannot succeed
2143 even after backtracking at the enclosing scope.
b81d288d 2144
e1901655
IZ
2145 XXXX what if minimal match and we are at the
2146 initial run of {n,m}? */
2147 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2148 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2149
c277df42 2150 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2151 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2152 (mincount == 0
2153 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2154
2155 if (flags & SCF_DO_STCLASS)
2156 data->start_class = oclass;
2157 if (mincount == 0 || minnext == 0) {
2158 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2159 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2160 }
2161 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2162 /* Switch to OR mode: cache the old value of
653099ff
GS
2163 * data->start_class */
2164 StructCopy(data->start_class, &and_with,
2165 struct regnode_charclass_class);
2166 flags &= ~SCF_DO_STCLASS_AND;
2167 StructCopy(&this_class, data->start_class,
2168 struct regnode_charclass_class);
2169 flags |= SCF_DO_STCLASS_OR;
2170 data->start_class->flags |= ANYOF_EOS;
2171 }
2172 } else { /* Non-zero len */
2173 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2174 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2175 cl_and(data->start_class, &and_with);
2176 }
2177 else if (flags & SCF_DO_STCLASS_AND)
2178 cl_and(data->start_class, &this_class);
2179 flags &= ~SCF_DO_STCLASS;
2180 }
c277df42
IZ
2181 if (!scan) /* It was not CURLYX, but CURLY. */
2182 scan = next;
041457d9
DM
2183 if ( /* ? quantifier ok, except for (?{ ... }) */
2184 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2185 && (minnext == 0) && (deltanext == 0)
99799961 2186 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2187 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2188 && ckWARN(WARN_REGEXP))
b45f050a 2189 {
830247a4 2190 vWARN(RExC_parse,
b45f050a
JF
2191 "Quantifier unexpected on zero-length expression");
2192 }
2193
c277df42 2194 min += minnext * mincount;
b81d288d 2195 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2196 && (minnext + deltanext) > 0)
2197 || deltanext == I32_MAX);
aca2d497 2198 is_inf |= is_inf_internal;
c277df42
IZ
2199 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2200
2201 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2202 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2203 && data->flags & SF_IN_PAR
2204 && !(data->flags & SF_HAS_EVAL)
2205 && !deltanext && minnext == 1 ) {
2206 /* Try to optimize to CURLYN. */
2207 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2208 regnode * const nxt1 = nxt;
497b47a8
JH
2209#ifdef DEBUGGING
2210 regnode *nxt2;
2211#endif
c277df42
IZ
2212
2213 /* Skip open. */
2214 nxt = regnext(nxt);
bfed75c6 2215 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2216 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2217 && STR_LEN(nxt) == 1))
c277df42 2218 goto nogo;
497b47a8 2219#ifdef DEBUGGING
c277df42 2220 nxt2 = nxt;
497b47a8 2221#endif
c277df42 2222 nxt = regnext(nxt);
b81d288d 2223 if (OP(nxt) != CLOSE)
c277df42
IZ
2224 goto nogo;
2225 /* Now we know that nxt2 is the only contents: */
eb160463 2226 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2227 OP(oscan) = CURLYN;
2228 OP(nxt1) = NOTHING; /* was OPEN. */
2229#ifdef DEBUGGING
2230 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2231 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2232 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2233 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2234 OP(nxt + 1) = OPTIMIZED; /* was count. */
2235 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2236#endif
c277df42 2237 }
c277df42
IZ
2238 nogo:
2239
2240 /* Try optimization CURLYX => CURLYM. */
b81d288d 2241 if ( OP(oscan) == CURLYX && data
c277df42 2242 && !(data->flags & SF_HAS_PAR)
c277df42 2243 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2244 && !deltanext /* atom is fixed width */
2245 && minnext != 0 /* CURLYM can't handle zero width */
2246 ) {
c277df42
IZ
2247 /* XXXX How to optimize if data == 0? */
2248 /* Optimize to a simpler form. */
2249 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2250 regnode *nxt2;
2251
2252 OP(oscan) = CURLYM;
2253 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2254 && (OP(nxt2) != WHILEM))
c277df42
IZ
2255 nxt = nxt2;
2256 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2257 /* Need to optimize away parenths. */
2258 if (data->flags & SF_IN_PAR) {
2259 /* Set the parenth number. */
2260 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2261
b81d288d 2262 if (OP(nxt) != CLOSE)
b45f050a 2263 FAIL("Panic opt close");
eb160463 2264 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2265 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2266 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2267#ifdef DEBUGGING
2268 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2269 OP(nxt + 1) = OPTIMIZED; /* was count. */
2270 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2271 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2272#endif
c277df42
IZ
2273#if 0
2274 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2275 regnode *nnxt = regnext(nxt1);
b81d288d 2276
c277df42
IZ
2277 if (nnxt == nxt) {
2278 if (reg_off_by_arg[OP(nxt1)])
2279 ARG_SET(nxt1, nxt2 - nxt1);
2280 else if (nxt2 - nxt1 < U16_MAX)
2281 NEXT_OFF(nxt1) = nxt2 - nxt1;
2282 else
2283 OP(nxt) = NOTHING; /* Cannot beautify */
2284 }
2285 nxt1 = nnxt;
2286 }
2287#endif
2288 /* Optimize again: */
b81d288d 2289 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2290 NULL, 0,depth+1);
a0ed51b3
LW
2291 }
2292 else
c277df42 2293 oscan->flags = 0;
c277df42 2294 }
e1901655
IZ
2295 else if ((OP(oscan) == CURLYX)
2296 && (flags & SCF_WHILEM_VISITED_POS)
2297 /* See the comment on a similar expression above.
2298 However, this time it not a subexpression
2299 we care about, but the expression itself. */
2300 && (maxcount == REG_INFTY)
2301 && data && ++data->whilem_c < 16) {
2302 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2303 /* Find WHILEM (as in regexec.c) */
2304 regnode *nxt = oscan + NEXT_OFF(oscan);
2305
2306 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2307 nxt += ARG(nxt);
eb160463
GS
2308 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2309 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2310 }
b81d288d 2311 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2312 pars++;
2313 if (flags & SCF_DO_SUBSTR) {
c445ea15 2314 SV *last_str = NULL;
c277df42
IZ
2315 int counted = mincount != 0;
2316
2317 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2318#if defined(SPARC64_GCC_WORKAROUND)
2319 I32 b = 0;
2320 STRLEN l = 0;
cfd0369c 2321 const char *s = NULL;
5d1c421c
JH
2322 I32 old = 0;
2323
2324 if (pos_before >= data->last_start_min)
2325 b = pos_before;
2326 else
2327 b = data->last_start_min;
2328
2329 l = 0;
cfd0369c 2330 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2331 old = b - data->last_start_min;
2332
2333#else
b81d288d 2334 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2335 ? pos_before : data->last_start_min;
2336 STRLEN l;
d4c19fe8 2337 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2338 I32 old = b - data->last_start_min;
5d1c421c 2339#endif
a0ed51b3
LW
2340
2341 if (UTF)
2342 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2343
a0ed51b3 2344 l -= old;
c277df42 2345 /* Get the added string: */
79cb57f6 2346 last_str = newSVpvn(s + old, l);
0e933229
IH
2347 if (UTF)
2348 SvUTF8_on(last_str);
c277df42
IZ
2349 if (deltanext == 0 && pos_before == b) {
2350 /* What was added is a constant string */
2351 if (mincount > 1) {
2352 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2353 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2354 SvPVX_const(last_str), l, mincount - 1);
b162af07 2355 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2356 /* Add additional parts. */
b81d288d 2357 SvCUR_set(data->last_found,
c277df42
IZ
2358 SvCUR(data->last_found) - l);
2359 sv_catsv(data->last_found, last_str);
0eda9292
JH
2360 {
2361 SV * sv = data->last_found;
2362 MAGIC *mg =
2363 SvUTF8(sv) && SvMAGICAL(sv) ?
2364 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2365 if (mg && mg->mg_len >= 0)
2366 mg->mg_len += CHR_SVLEN(last_str);
2367 }
c277df42
IZ
2368 data->last_end += l * (mincount - 1);
2369 }
2a8d9689
HS
2370 } else {
2371 /* start offset must point into the last copy */
2372 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2373 data->last_start_max += is_inf ? I32_MAX
2374 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2375 }
2376 }
2377 /* It is counted once already... */
2378 data->pos_min += minnext * (mincount - counted);
2379 data->pos_delta += - counted * deltanext +
2380 (minnext + deltanext) * maxcount - minnext * mincount;
2381 if (mincount != maxcount) {
653099ff
GS
2382 /* Cannot extend fixed substrings found inside
2383 the group. */
830247a4 2384 scan_commit(pRExC_state,data);
c277df42 2385 if (mincount && last_str) {
d4c19fe8
AL
2386 SV * const sv = data->last_found;
2387 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2388 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2389
2390 if (mg)
2391 mg->mg_len = -1;
2392 sv_setsv(sv, last_str);
c277df42 2393 data->last_end = data->pos_min;
b81d288d 2394 data->last_start_min =
a0ed51b3 2395 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2396 data->last_start_max = is_inf
2397 ? I32_MAX
c277df42 2398 : data->pos_min + data->pos_delta
a0ed51b3 2399 - CHR_SVLEN(last_str);
c277df42
IZ
2400 }
2401 data->longest = &(data->longest_float);
2402 }
aca2d497 2403 SvREFCNT_dec(last_str);
c277df42 2404 }
405ff068 2405 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2406 data->flags |= SF_HAS_EVAL;
2407 optimize_curly_tail:
c277df42 2408 if (OP(oscan) != CURLYX) {
22c35a8c 2409 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2410 && NEXT_OFF(next))
2411 NEXT_OFF(oscan) += NEXT_OFF(next);
2412 }
c277df42 2413 continue;
653099ff 2414 default: /* REF and CLUMP only? */
c277df42 2415 if (flags & SCF_DO_SUBSTR) {
830247a4 2416 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2417 data->longest = &(data->longest_float);
2418 }
aca2d497 2419 is_inf = is_inf_internal = 1;
653099ff 2420 if (flags & SCF_DO_STCLASS_OR)
830247a4 2421 cl_anything(pRExC_state, data->start_class);
653099ff 2422 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2423 break;
2424 }
a0ed51b3 2425 }
bfed75c6 2426 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2427 int value = 0;
653099ff 2428
c277df42 2429 if (flags & SCF_DO_SUBSTR) {
830247a4 2430 scan_commit(pRExC_state,data);
c277df42
IZ
2431 data->pos_min++;
2432 }
2433 min++;
653099ff
GS
2434 if (flags & SCF_DO_STCLASS) {
2435 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2436
2437 /* Some of the logic below assumes that switching
2438 locale on will only add false positives. */
2439 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2440 case SANY:
653099ff
GS
2441 default:
2442 do_default:
2443 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2444 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2445 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2446 break;
2447 case REG_ANY:
2448 if (OP(scan) == SANY)
2449 goto do_default;
2450 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2451 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2452 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2453 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2454 }
2455 if (flags & SCF_DO_STCLASS_AND || !value)
2456 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2457 break;
2458 case ANYOF:
2459 if (flags & SCF_DO_STCLASS_AND)
2460 cl_and(data->start_class,
2461 (struct regnode_charclass_class*)scan);
2462 else
830247a4 2463 cl_or(pRExC_state, data->start_class,
653099ff
GS
2464 (struct regnode_charclass_class*)scan);
2465 break;
2466 case ALNUM:
2467 if (flags & SCF_DO_STCLASS_AND) {
2468 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2469 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2470 for (value = 0; value < 256; value++)
2471 if (!isALNUM(value))
2472 ANYOF_BITMAP_CLEAR(data->start_class, value);
2473 }
2474 }
2475 else {
2476 if (data->start_class->flags & ANYOF_LOCALE)
2477 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2478 else {
2479 for (value = 0; value < 256; value++)
2480 if (isALNUM(value))
b81d288d 2481 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2482 }
2483 }
2484 break;
2485 case ALNUML:
2486 if (flags & SCF_DO_STCLASS_AND) {
2487 if (data->start_class->flags & ANYOF_LOCALE)
2488 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2489 }
2490 else {
2491 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2492 data->start_class->flags |= ANYOF_LOCALE;
2493 }
2494 break;
2495 case NALNUM:
2496 if (flags & SCF_DO_STCLASS_AND) {
2497 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2498 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2499 for (value = 0; value < 256; value++)
2500 if (isALNUM(value))
2501 ANYOF_BITMAP_CLEAR(data->start_class, value);
2502 }
2503 }
2504 else {
2505 if (data->start_class->flags & ANYOF_LOCALE)
2506 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2507 else {
2508 for (value = 0; value < 256; value++)
2509 if (!isALNUM(value))
b81d288d 2510 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2511 }
2512 }
2513 break;
2514 case NALNUML:
2515 if (flags & SCF_DO_STCLASS_AND) {
2516 if (data->start_class->flags & ANYOF_LOCALE)
2517 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2518 }
2519 else {
2520 data->start_class->flags |= ANYOF_LOCALE;
2521 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2522 }
2523 break;
2524 case SPACE:
2525 if (flags & SCF_DO_STCLASS_AND) {
2526 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2528 for (value = 0; value < 256; value++)
2529 if (!isSPACE(value))
2530 ANYOF_BITMAP_CLEAR(data->start_class, value);
2531 }
2532 }
2533 else {
2534 if (data->start_class->flags & ANYOF_LOCALE)
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2536 else {
2537 for (value = 0; value < 256; value++)
2538 if (isSPACE(value))
b81d288d 2539 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2540 }
2541 }
2542 break;
2543 case SPACEL:
2544 if (flags & SCF_DO_STCLASS_AND) {
2545 if (data->start_class->flags & ANYOF_LOCALE)
2546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2547 }
2548 else {
2549 data->start_class->flags |= ANYOF_LOCALE;
2550 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2551 }
2552 break;
2553 case NSPACE:
2554 if (flags & SCF_DO_STCLASS_AND) {
2555 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2557 for (value = 0; value < 256; value++)
2558 if (isSPACE(value))
2559 ANYOF_BITMAP_CLEAR(data->start_class, value);
2560 }
2561 }
2562 else {
2563 if (data->start_class->flags & ANYOF_LOCALE)
2564 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2565 else {
2566 for (value = 0; value < 256; value++)
2567 if (!isSPACE(value))
b81d288d 2568 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2569 }
2570 }
2571 break;
2572 case NSPACEL:
2573 if (flags & SCF_DO_STCLASS_AND) {
2574 if (data->start_class->flags & ANYOF_LOCALE) {
2575 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2576 for (value = 0; value < 256; value++)
2577 if (!isSPACE(value))
2578 ANYOF_BITMAP_CLEAR(data->start_class, value);
2579 }
2580 }
2581 else {
2582 data->start_class->flags |= ANYOF_LOCALE;
2583 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2584 }
2585 break;
2586 case DIGIT:
2587 if (flags & SCF_DO_STCLASS_AND) {
2588 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2589 for (value = 0; value < 256; value++)
2590 if (!isDIGIT(value))
2591 ANYOF_BITMAP_CLEAR(data->start_class, value);
2592 }
2593 else {
2594 if (data->start_class->flags & ANYOF_LOCALE)
2595 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2596 else {
2597 for (value = 0; value < 256; value++)
2598 if (isDIGIT(value))
b81d288d 2599 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2600 }
2601 }
2602 break;
2603 case NDIGIT:
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2606 for (value = 0; value < 256; value++)
2607 if (isDIGIT(value))
2608 ANYOF_BITMAP_CLEAR(data->start_class, value);
2609 }
2610 else {
2611 if (data->start_class->flags & ANYOF_LOCALE)
2612 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2613 else {
2614 for (value = 0; value < 256; value++)
2615 if (!isDIGIT(value))
b81d288d 2616 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2617 }
2618 }
2619 break;
2620 }
2621 if (flags & SCF_DO_STCLASS_OR)
2622 cl_and(data->start_class, &and_with);
2623 flags &= ~SCF_DO_STCLASS;
2624 }
a0ed51b3 2625 }
22c35a8c 2626 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2627 data->flags |= (OP(scan) == MEOL
2628 ? SF_BEFORE_MEOL
2629 : SF_BEFORE_SEOL);
a0ed51b3 2630 }
653099ff
GS
2631 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2632 /* Lookbehind, or need to calculate parens/evals/stclass: */
2633 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2634 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2635 /* Lookahead/lookbehind */
cb434fcc 2636 I32 deltanext, minnext, fake = 0;
c277df42 2637 regnode *nscan;
653099ff
GS
2638 struct regnode_charclass_class intrnl;
2639 int f = 0;
c277df42
IZ
2640
2641 data_fake.flags = 0;
b81d288d 2642 if (data) {
2c2d71f5 2643 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2644 data_fake.last_closep = data->last_closep;
2645 }
2646 else
2647 data_fake.last_closep = &fake;
653099ff
GS
2648 if ( flags & SCF_DO_STCLASS && !scan->flags
2649 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2650 cl_init(pRExC_state, &intrnl);
653099ff 2651 data_fake.start_class = &intrnl;
e1901655 2652 f |= SCF_DO_STCLASS_AND;
653099ff 2653 }
e1901655
IZ
2654 if (flags & SCF_WHILEM_VISITED_POS)
2655 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2656 next = regnext(scan);
2657 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2658 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2659 if (scan->flags) {
2660 if (deltanext) {
9baa0206 2661 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2662 }
2663 else if (minnext > U8_MAX) {
9baa0206 2664 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2665 }
eb160463 2666 scan->flags = (U8)minnext;
c277df42
IZ
2667 }
2668 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2669 pars++;
405ff068 2670 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2671 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2672 if (data)
2673 data->whilem_c = data_fake.whilem_c;
e1901655 2674 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2675 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2676
2677 cl_and(data->start_class, &intrnl);
2678 if (was)
2679 data->start_class->flags |= ANYOF_EOS;
2680 }
a0ed51b3
LW
2681 }
2682 else if (OP(scan) == OPEN) {
c277df42 2683 pars++;
a0ed51b3 2684 }
cb434fcc 2685 else if (OP(scan) == CLOSE) {
eb160463 2686 if ((I32)ARG(scan) == is_par) {
cb434fcc 2687 next = regnext(scan);
c277df42 2688
cb434fcc
IZ
2689 if ( next && (OP(next) != WHILEM) && next < last)
2690 is_par = 0; /* Disable optimization */
2691 }
2692 if (data)
2693 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2694 }
2695 else if (OP(scan) == EVAL) {
c277df42
IZ
2696 if (data)
2697 data->flags |= SF_HAS_EVAL;
2698 }
96776eda 2699 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2700 if (flags & SCF_DO_SUBSTR) {
830247a4 2701 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2702 data->longest = &(data->longest_float);
2703 }
2704 is_inf = is_inf_internal = 1;
653099ff 2705 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2706 cl_anything(pRExC_state, data->start_class);
96776eda 2707 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2708 }
c277df42
IZ
2709 /* Else: zero-length, ignore. */
2710 scan = regnext(scan);
2711 }
2712
2713 finish:
2714 *scanp = scan;
aca2d497 2715 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2716 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2717 data->pos_delta = I32_MAX - data->pos_min;
2718 if (is_par > U8_MAX)
2719 is_par = 0;
2720 if (is_par && pars==1 && data) {
2721 data->flags |= SF_IN_PAR;
2722 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2723 }
2724 else if (pars && data) {
c277df42
IZ
2725 data->flags |= SF_HAS_PAR;
2726 data->flags &= ~SF_IN_PAR;
2727 }
653099ff
GS
2728 if (flags & SCF_DO_STCLASS_OR)
2729 cl_and(data->start_class, &and_with);
c277df42
IZ
2730 return min;
2731}
2732
76e3520e 2733STATIC I32
5f66b61c 2734S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2735{
830247a4 2736 if (RExC_rx->data) {
b81d288d
AB
2737 Renewc(RExC_rx->data,
2738 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2739 char, struct reg_data);
830247a4
IZ
2740 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2741 RExC_rx->data->count += n;
a0ed51b3
LW
2742 }
2743 else {
a02a5408 2744 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2745 char, struct reg_data);
a02a5408 2746 Newx(RExC_rx->data->what, n, U8);
830247a4 2747 RExC_rx->data->count = n;
c277df42 2748 }
830247a4
IZ
2749 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2750 return RExC_rx->data->count - n;
c277df42
IZ
2751}
2752
76234dfb 2753#ifndef PERL_IN_XSUB_RE
d88dccdf 2754void
864dbfa3 2755Perl_reginitcolors(pTHX)
d88dccdf 2756{
97aff369 2757 dVAR;
1df70142 2758 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2759 if (s) {
1df70142
AL
2760 char *t = savepv(s);
2761 int i = 0;
2762 PL_colors[0] = t;
d88dccdf 2763 while (++i < 6) {
1df70142
AL
2764 t = strchr(t, '\t');
2765 if (t) {
2766 *t = '\0';
2767 PL_colors[i] = ++t;
d88dccdf
IZ
2768 }
2769 else
1df70142 2770 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2771 }
2772 } else {
1df70142 2773 int i = 0;
b81d288d 2774 while (i < 6)
06b5626a 2775 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2776 }
2777 PL_colorset = 1;
2778}
76234dfb 2779#endif
8615cb43 2780
a687059c 2781/*
e50aee73 2782 - pregcomp - compile a regular expression into internal code
a687059c
LW
2783 *
2784 * We can't allocate space until we know how big the compiled form will be,
2785 * but we can't compile it (and thus know how big it is) until we've got a
2786 * place to put the code. So we cheat: we compile it twice, once with code
2787 * generation turned off and size counting turned on, and once "for real".
2788 * This also means that we don't allocate space until we are sure that the
2789 * thing really will compile successfully, and we never have to move the
2790 * code and thus invalidate pointers into it. (Note that it has to be in
2791 * one piece because free() must be able to free it all.) [NB: not true in perl]
2792 *
2793 * Beware that the optimization-preparation code in here knows about some
2794 * of the structure of the compiled regexp. [I'll say.]
2795 */
2796regexp *
864dbfa3 2797Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2798{
97aff369 2799 dVAR;
a0d0e21e 2800 register regexp *r;
c277df42 2801 regnode *scan;
c277df42 2802 regnode *first;
a0d0e21e 2803 I32 flags;
a0d0e21e
LW
2804 I32 minlen = 0;
2805 I32 sawplus = 0;
2806 I32 sawopen = 0;
2c2d71f5 2807 scan_data_t data;
830247a4
IZ
2808 RExC_state_t RExC_state;
2809 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2810
a3621e74
YO
2811 GET_RE_DEBUG_FLAGS_DECL;
2812
a0d0e21e 2813 if (exp == NULL)
c277df42 2814 FAIL("NULL regexp argument");
a0d0e21e 2815
a5961de5 2816 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2817
5cfc7842 2818 RExC_precomp = exp;
a3621e74
YO
2819 DEBUG_r(if (!PL_colorset) reginitcolors());
2820 DEBUG_COMPILE_r({
2821 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2822 PL_colors[4],PL_colors[5],PL_colors[0],
2823 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2824 });
e2509266 2825 RExC_flags = pm->op_pmflags;
830247a4 2826 RExC_sawback = 0;
bbce6d69 2827
830247a4
IZ
2828 RExC_seen = 0;
2829 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2830 RExC_seen_evals = 0;
2831 RExC_extralen = 0;
c277df42 2832
bbce6d69 2833 /* First pass: determine size, legality. */
830247a4 2834 RExC_parse = exp;
fac92740 2835 RExC_start = exp;
830247a4
IZ
2836 RExC_end = xend;
2837 RExC_naughty = 0;
2838 RExC_npar = 1;
2839 RExC_size = 0L;
2840 RExC_emit = &PL_regdummy;
2841 RExC_whilem_seen = 0;
85ddcde9
JH
2842#if 0 /* REGC() is (currently) a NOP at the first pass.
2843 * Clever compilers notice this and complain. --jhi */
830247a4 2844 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2845#endif
830247a4 2846 if (reg(pRExC_state, 0, &flags) == NULL) {
c445ea15 2847 RExC_precomp = NULL;
a0d0e21e
LW
2848 return(NULL);
2849 }
a3621e74 2850 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2851
c277df42
IZ
2852 /* Small enough for pointer-storage convention?
2853 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2854 if (RExC_size >= 0x10000L && RExC_extralen)
2855 RExC_size += RExC_extralen;
c277df42 2856 else
830247a4
IZ
2857 RExC_extralen = 0;
2858 if (RExC_whilem_seen > 15)
2859 RExC_whilem_seen = 15;
a0d0e21e 2860
bbce6d69 2861 /* Allocate space and initialize. */
a02a5408 2862 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2863 char, regexp);
a0d0e21e 2864 if (r == NULL)
b45f050a
JF
2865 FAIL("Regexp out of space");
2866
0f79a09d
GS
2867#ifdef DEBUGGING
2868 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2869 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2870#endif
c277df42 2871 r->refcnt = 1;
bbce6d69 2872 r->prelen = xend - exp;
5cfc7842 2873 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2874 r->subbeg = NULL;
f8c7b90f 2875#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 2876 r->saved_copy = NULL;
ed252734 2877#endif
cf93c79d 2878 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2879 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 2880 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
2881
2882 r->substrs = 0; /* Useful during FAIL. */
2883 r->startp = 0; /* Useful during FAIL. */
2884 r->endp = 0; /* Useful during FAIL. */
2885
a02a5408 2886 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2887 if (r->offsets) {
2af232bd 2888 r->offsets[0] = RExC_size;
fac92740 2889 }
a3621e74 2890 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SPS
2891 "%s %"UVuf" bytes for offset annotations.\n",
2892 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2893 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2894
830247a4 2895 RExC_rx = r;
bbce6d69
PP
2896
2897 /* Second pass: emit code. */
e2509266 2898 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2899 RExC_parse = exp;
2900 RExC_end = xend;
2901 RExC_naughty = 0;
2902 RExC_npar = 1;
fac92740 2903 RExC_emit_start = r->program;
830247a4 2904 RExC_emit = r->program;
2cd61cdb 2905 /* Store the count of eval-groups for security checks: */
eb160463 2906 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2907 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2908 r->data = 0;
830247a4 2909 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2910 return(NULL);
2911
a3621e74 2912
a0d0e21e 2913 /* Dig out information for optimizations. */
cf93c79d 2914 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2915 pm->op_pmflags = RExC_flags;
a0ed51b3 2916 if (UTF)
5ff6fc6d 2917 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2918 r->regstclass = NULL;
830247a4 2919 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2920 r->reganch |= ROPT_NAUGHTY;
c277df42 2921 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2922
2923 /* XXXX To minimize changes to RE engine we always allocate
2924 3-units-long substrs field. */
a02a5408 2925 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2926
2c2d71f5 2927 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2928 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2929 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2930 I32 fake;
c5254dd6 2931 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2932 struct regnode_charclass_class ch_class;
2933 int stclass_flag;
cb434fcc 2934 I32 last_close = 0;
a0d0e21e
LW
2935
2936 first = scan;
c277df42 2937 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2938 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2939 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2940 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2941 (OP(first) == PLUS) ||
2942 (OP(first) == MINMOD) ||
653099ff 2943 /* An {n,m} with n>0 */
22c35a8c 2944 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2945 if (OP(first) == PLUS)
2946 sawplus = 1;
2947 else
2948 first += regarglen[(U8)OP(first)];
2949 first = NEXTOPER(first);
a687059c
LW
2950 }
2951
a0d0e21e
LW
2952 /* Starting-point info. */
2953 again:
653099ff 2954 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b 2955 if (OP(first) == EXACT)
bb263b4e 2956 /*EMPTY*/; /* Empty, get anchored substr later. */
1aa99e6b 2957 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2958 r->regstclass = first;
2959 }
bfed75c6 2960 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2961 r->regstclass = first;
22c35a8c
GS
2962 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2963 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2964 r->regstclass = first;
22c35a8c 2965 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2966 r->reganch |= (OP(first) == MBOL
2967 ? ROPT_ANCH_MBOL
2968 : (OP(first) == SBOL
2969 ? ROPT_ANCH_SBOL
2970 : ROPT_ANCH_BOL));
a0d0e21e 2971 first = NEXTOPER(first);
774d564b
PP
2972 goto again;
2973 }
2974 else if (OP(first) == GPOS) {
2975 r->reganch |= ROPT_ANCH_GPOS;
2976 first = NEXTOPER(first);
2977 goto again;
a0d0e21e 2978 }
e09294f4 2979 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2980 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2981 !(r->reganch & ROPT_ANCH) )
2982 {
2983 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2984 const int type =
2985 (OP(NEXTOPER(first)) == REG_ANY)
2986 ? ROPT_ANCH_MBOL
2987 : ROPT_ANCH_SBOL;
cad2e5aa 2988 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 2989 first = NEXTOPER(first);
774d564b 2990 goto again;
a0d0e21e 2991 }
b81d288d 2992 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 2993 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
2994 /* x+ must match at the 1st pos of run of x's */
2995 r->reganch |= ROPT_SKIP;
a0d0e21e 2996
c277df42 2997 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 2998 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 2999 (IV)(first - scan + 1)));
a0d0e21e
LW
3000 /*
3001 * If there's something expensive in the r.e., find the
3002 * longest literal string that must appear and make it the
3003 * regmust. Resolve ties in favor of later strings, since
3004 * the regstart check works with the beginning of the r.e.
3005 * and avoiding duplication strengthens checking. Not a
3006 * strong reason, but sufficient in the absence of others.
3007 * [Now we resolve ties in favor of the earlier string if
c277df42 3008 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3009 * earlier string may buy us something the later one won't.]
3010 */
a0d0e21e 3011 minlen = 0;
a687059c 3012
396482e1
GA
3013 data.longest_fixed = newSVpvs("");
3014 data.longest_float = newSVpvs("");
3015 data.last_found = newSVpvs("");
c277df42
IZ
3016 data.longest = &(data.longest_fixed);
3017 first = scan;
653099ff 3018 if (!r->regstclass) {
830247a4 3019 cl_init(pRExC_state, &ch_class);
653099ff
GS
3020 data.start_class = &ch_class;
3021 stclass_flag = SCF_DO_STCLASS_AND;
3022 } else /* XXXX Check for BOUND? */
3023 stclass_flag = 0;
cb434fcc 3024 data.last_closep = &last_close;
653099ff 3025
830247a4 3026 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3027 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3028 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3029 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3030 && !RExC_seen_zerolen
3031 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3032 r->reganch |= ROPT_CHECK_ALL;
830247a4 3033 scan_commit(pRExC_state, &data);
c277df42
IZ
3034 SvREFCNT_dec(data.last_found);
3035
a0ed51b3 3036 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3037 if (longest_float_length
c277df42
IZ
3038 || (data.flags & SF_FL_BEFORE_EOL
3039 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3040 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3041 int t;
3042
a0ed51b3 3043 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3044 && data.offset_fixed == data.offset_float_min
3045 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3046 goto remove_float; /* As in (a)+. */
3047
33b8afdf
JH
3048 if (SvUTF8(data.longest_float)) {
3049 r->float_utf8 = data.longest_float;
c445ea15 3050 r->float_substr = NULL;
33b8afdf
JH
3051 } else {
3052 r->float_substr = data.longest_float;
c445ea15 3053 r->float_utf8 = NULL;
33b8afdf 3054 }
c277df42
IZ
3055 r->float_min_offset = data.offset_float_min;
3056 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3057 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3058 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3059 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3060 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3061 }
3062 else {
aca2d497 3063 remove_float:
c445ea15 3064 r->float_substr = r->float_utf8 = NULL;
c277df42 3065 SvREFCNT_dec(data.longest_float);
c5254dd6 3066 longest_float_length = 0;
a0d0e21e 3067 }
c277df42 3068
a0ed51b3 3069 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3070 if (longest_fixed_length
c277df42
IZ
3071 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3072 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3073 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3074 int t;
3075
33b8afdf
JH
3076 if (SvUTF8(data.longest_fixed)) {
3077 r->anchored_utf8 = data.longest_fixed;
c445ea15 3078 r->anchored_substr = NULL;
33b8afdf
JH
3079 } else {
3080 r->anchored_substr = data.longest_fixed;
c445ea15 3081 r->anchored_utf8 = NULL;
33b8afdf 3082 }
c277df42 3083 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3084 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3085 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3086 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3087 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3088 }
3089 else {
c445ea15 3090 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3091 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3092 longest_fixed_length = 0;
a0d0e21e 3093 }
b81d288d 3094 if (r->regstclass
ffc61ed2 3095 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3096 r->regstclass = NULL;
33b8afdf
JH
3097 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3098 && stclass_flag
653099ff 3099 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3100 && !cl_is_anything(data.start_class))
3101 {
1df70142 3102 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3103
a02a5408 3104 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3105 struct regnode_charclass_class);
3106 StructCopy(data.start_class,
830247a4 3107 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3108 struct regnode_charclass_class);
830247a4 3109 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3110 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3111 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3112 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3113 PerlIO_printf(Perl_debug_log,
a0288114 3114 "synthetic stclass \"%s\".\n",
3f7c398e 3115 SvPVX_const(sv));});
653099ff 3116 }
c277df42
IZ
3117
3118 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3119 if (longest_fixed_length > longest_float_length) {
c277df42 3120 r->check_substr = r->anchored_substr;
33b8afdf 3121 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3122 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3123 if (r->reganch & ROPT_ANCH_SINGLE)
3124 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3125 }
3126 else {
c277df42 3127 r->check_substr = r->float_substr;
33b8afdf 3128 r->check_utf8 = r->float_utf8;
c277df42
IZ
3129 r->check_offset_min = data.offset_float_min;
3130 r->check_offset_max = data.offset_float_max;
a0d0e21e 3131 }
30382c73
IZ
3132 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3133 This should be changed ASAP! */
33b8afdf 3134 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3135 r->reganch |= RE_USE_INTUIT;
33b8afdf 3136 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3137 r->reganch |= RE_INTUIT_TAIL;
3138 }
a0ed51b3
LW
3139 }
3140 else {
c277df42
IZ
3141 /* Several toplevels. Best we can is to set minlen. */
3142 I32 fake;
653099ff 3143 struct regnode_charclass_class ch_class;
cb434fcc 3144 I32 last_close = 0;
c277df42 3145
a3621e74 3146 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3147 scan = r->program + 1;
830247a4 3148 cl_init(pRExC_state, &ch_class);
653099ff 3149 data.start_class = &ch_class;
cb434fcc 3150 data.last_closep = &last_close;
a3621e74 3151 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3152 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3153 = r->float_substr = r->float_utf8 = NULL;
653099ff 3154 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3155 && !cl_is_anything(data.start_class))
3156 {
1df70142 3157 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3158
a02a5408 3159 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3160 struct regnode_charclass_class);
3161 StructCopy(data.start_class,
830247a4 3162 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3163 struct regnode_charclass_class);
830247a4 3164 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3165 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3166 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3167 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3168 PerlIO_printf(Perl_debug_log,
a0288114 3169 "synthetic stclass \"%s\".\n",
3f7c398e 3170 SvPVX_const(sv));});
653099ff 3171 }
a0d0e21e
LW
3172 }
3173
a0d0e21e 3174 r->minlen = minlen;
b81d288d 3175 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3176 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3177 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3178 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3179 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3180 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3181 if (RExC_seen & REG_SEEN_CANY)
3182 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3183 Newxz(r->startp, RExC_npar, I32);
3184 Newxz(r->endp, RExC_npar, I32);
a3621e74 3185 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3186 return(r);
a687059c
LW
3187}
3188
3189/*
3190 - reg - regular expression, i.e. main body or parenthesized thing
3191 *
3192 * Caller must absorb opening parenthesis.
3193 *
3194 * Combining parenthesis handling with the base level of regular expression
3195 * is a trifle forced, but the need to tie the tails of the branches to what
3196 * follows makes it hard to avoid.
3197 */
76e3520e 3198STATIC regnode *
830247a4 3199S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3200 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3201{
27da23d5 3202 dVAR;
c277df42
IZ
3203 register regnode *ret; /* Will be the head of the group. */
3204 register regnode *br;
3205 register regnode *lastbr;
cbbf8932 3206 register regnode *ender = NULL;
a0d0e21e 3207 register I32 parno = 0;
cbbf8932
AL
3208 I32 flags;
3209 const I32 oregflags = RExC_flags;
6136c704
AL
3210 bool have_branch = 0;
3211 bool is_open = 0;
9d1d55b5
JP
3212
3213 /* for (?g), (?gc), and (?o) warnings; warning
3214 about (?c) will warn about (?g) -- japhy */
3215
6136c704
AL
3216#define WASTED_O 0x01
3217#define WASTED_G 0x02
3218#define WASTED_C 0x04
3219#define WASTED_GC (0x02|0x04)
cbbf8932 3220 I32 wastedflags = 0x00;
9d1d55b5 3221
fac92740 3222 char * parse_start = RExC_parse; /* MJD */
a28509cc 3223 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3224
821b33a5 3225 *flagp = 0; /* Tentatively. */
a0d0e21e 3226
9d1d55b5 3227
a0d0e21e
LW
3228 /* Make an OPEN node, if parenthesized. */
3229 if (paren) {
fac92740 3230 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3231 U32 posflags = 0, negflags = 0;
3232 U32 *flagsp = &posflags;
6136c704 3233 bool is_logical = 0;
a28509cc 3234 const char * const seqstart = RExC_parse;
ca9dfc88 3235
830247a4
IZ
3236 RExC_parse++;
3237 paren = *RExC_parse++;
c277df42 3238 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3239 switch (paren) {
fac92740 3240 case '<': /* (?<...) */
830247a4 3241 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3242 if (*RExC_parse == '!')
c277df42 3243 paren = ',';
b81d288d 3244 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3245 goto unknown;
830247a4 3246 RExC_parse++;
fac92740
MJD
3247 case '=': /* (?=...) */
3248 case '!': /* (?!...) */
830247a4 3249 RExC_seen_zerolen++;
fac92740
MJD
3250 case ':': /* (?:...) */
3251 case '>': /* (?>...) */
a0d0e21e 3252 break;
fac92740
MJD
3253 case '$': /* (?$...) */
3254 case '@': /* (?@...) */
8615cb43 3255 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3256 break;
fac92740 3257 case '#': /* (?#...) */
830247a4
IZ
3258 while (*RExC_parse && *RExC_parse != ')')
3259 RExC_parse++;
3260 if (*RExC_parse != ')')
c277df42 3261 FAIL("Sequence (?#... not terminated");
830247a4 3262 nextchar(pRExC_state);
a0d0e21e
LW
3263 *flagp = TRYAGAIN;
3264 return NULL;
fac92740 3265 case 'p': /* (?p...) */
9014280d 3266 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3267 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3268 /* FALL THROUGH*/
fac92740 3269 case '?': /* (??...) */
6136c704 3270 is_logical = 1;
438a3801
YST
3271 if (*RExC_parse != '{')
3272 goto unknown;
830247a4 3273 paren = *RExC_parse++;
0f5d15d6 3274 /* FALL THROUGH */
fac92740 3275 case '{': /* (?{...}) */
c277df42 3276 {
c277df42
IZ
3277 I32 count = 1, n = 0;
3278 char c;
830247a4 3279 char *s = RExC_parse;
c277df42 3280
830247a4
IZ
3281 RExC_seen_zerolen++;
3282 RExC_seen |= REG_SEEN_EVAL;
3283 while (count && (c = *RExC_parse)) {
6136c704
AL
3284 if (c == '\\') {
3285 if (RExC_parse[1])
3286 RExC_parse++;
3287 }
b81d288d 3288 else if (c == '{')
c277df42 3289 count++;
b81d288d 3290 else if (c == '}')
c277df42 3291 count--;
830247a4 3292 RExC_parse++;
c277df42 3293 }
6136c704 3294 if (*RExC_parse != ')') {
b81d288d 3295 RExC_parse = s;
b45f050a
JF
3296 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3297 }
c277df42 3298 if (!SIZE_ONLY) {
f3548bdc 3299 PAD *pad;
6136c704
AL
3300 OP_4tree *sop, *rop;
3301 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3302
569233ed
SB
3303 ENTER;
3304 Perl_save_re_context(aTHX);
f3548bdc 3305 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3306 sop->op_private |= OPpREFCOUNTED;
3307 /* re_dup will OpREFCNT_inc */
3308 OpREFCNT_set(sop, 1);
569233ed 3309 LEAVE;
c277df42 3310
830247a4
IZ
3311 n = add_data(pRExC_state, 3, "nop");
3312 RExC_rx->data->data[n] = (void*)rop;
3313 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3314 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3315 SvREFCNT_dec(sv);
a0ed51b3 3316 }
e24b16f9 3317 else { /* First pass */
830247a4 3318 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3319 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3320 /* No compiled RE interpolated, has runtime
3321 components ===> unsafe. */
3322 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3323 if (PL_tainting && PL_tainted)
cc6b7395 3324 FAIL("Eval-group in insecure regular expression");
923e4eb5 3325 if (IN_PERL_COMPILETIME)
b5c19bd7 3326 PL_cv_has_eval = 1;
c277df42 3327 }
b5c19bd7 3328
830247a4 3329 nextchar(pRExC_state);
6136c704 3330 if (is_logical) {
830247a4 3331 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3332 if (!SIZE_ONLY)
3333 ret->flags = 2;
830247a4 3334 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3335 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3336 return ret;
3337 }
ccb2c380
MP
3338 ret = reganode(pRExC_state, EVAL, n);
3339 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3340 Set_Node_Offset(ret, parse_start);
3341 return ret;
c277df42 3342 }
fac92740 3343 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3344 {
fac92740 3345 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3346 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3347 || RExC_parse[1] == '<'
830247a4 3348 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3349 I32 flag;
3350
830247a4 3351 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3352 if (!SIZE_ONLY)
3353 ret->flags = 1;
830247a4 3354 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3355 goto insert_if;
b81d288d 3356 }
a0ed51b3 3357 }
830247a4 3358 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3359 /* (?(1)...) */
6136c704 3360 char c;
830247a4 3361 parno = atoi(RExC_parse++);
c277df42 3362
830247a4
IZ
3363 while (isDIGIT(*RExC_parse))
3364 RExC_parse++;
fac92740 3365 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3366
830247a4 3367 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3368 vFAIL("Switch condition not recognized");
c277df42 3369 insert_if:
830247a4
IZ
3370 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3371 br = regbranch(pRExC_state, &flags, 1);
c277df42 3372 if (br == NULL)
830247a4 3373 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3374 else
830247a4
IZ
3375 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3376 c = *nextchar(pRExC_state);
d1b80229
IZ
3377 if (flags&HASWIDTH)
3378 *flagp |= HASWIDTH;
c277df42 3379 if (c == '|') {
830247a4
IZ
3380 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3381 regbranch(pRExC_state, &flags, 1);
3382 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3383 if (flags&HASWIDTH)
3384 *flagp |= HASWIDTH;
830247a4 3385 c = *nextchar(pRExC_state);
a0ed51b3
LW
3386 }
3387 else
c277df42
IZ
3388 lastbr = NULL;
3389 if (c != ')')
8615cb43 3390 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3391 ender = reg_node(pRExC_state, TAIL);
3392 regtail(pRExC_state, br, ender);
c277df42 3393 if (lastbr) {
830247a4
IZ
3394 regtail(pRExC_state, lastbr, ender);
3395 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3396 }
3397 else
830247a4 3398 regtail(pRExC_state, ret, ender);
c277df42 3399 return ret;
a0ed51b3
LW
3400 }
3401 else {
830247a4 3402 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3403 }
3404 }
1b1626e4 3405 case 0:
830247a4 3406 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3407 vFAIL("Sequence (? incomplete");
1b1626e4 3408 break;
a0d0e21e 3409 default:
830247a4 3410 --RExC_parse;
fac92740 3411 parse_flags: /* (?i) */
830247a4 3412 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3413 /* (?g), (?gc) and (?o) are useless here
3414 and must be globally applied -- japhy */
3415
3416 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3417 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3418 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3419 if (! (wastedflags & wflagbit) ) {
3420 wastedflags |= wflagbit;
3421 vWARN5(
3422 RExC_parse + 1,
3423 "Useless (%s%c) - %suse /%c modifier",
3424 flagsp == &negflags ? "?-" : "?",
3425 *RExC_parse,
3426 flagsp == &negflags ? "don't " : "",
3427 *RExC_parse
3428 );
3429 }
3430 }
3431 }
3432 else if (*RExC_parse == 'c') {
3433 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3434 if (! (wastedflags & WASTED_C) ) {
3435 wastedflags |= WASTED_GC;
9d1d55b5
JP
3436 vWARN3(
3437 RExC_parse + 1,
3438 "Useless (%sc) - %suse /gc modifier",
3439 flagsp == &negflags ? "?-" : "?",
3440 flagsp == &negflags ? "don't " : ""
3441 );
3442 }
3443 }
3444 }
3445 else { pmflag(flagsp, *RExC_parse); }
3446
830247a4 3447 ++RExC_parse;
ca9dfc88 3448 }
830247a4 3449 if (*RExC_parse == '-') {
ca9dfc88 3450 flagsp = &negflags;
9d1d55b5 3451 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3452 ++RExC_parse;
ca9dfc88 3453 goto parse_flags;
48c036b1 3454 }
e2509266
JH
3455 RExC_flags |= posflags;
3456 RExC_flags &= ~negflags;
830247a4
IZ
3457 if (*RExC_parse == ':') {
3458 RExC_parse++;
ca9dfc88
IZ
3459 paren = ':';
3460 break;
3461 }
c277df42 3462 unknown:
830247a4
IZ
3463 if (*RExC_parse != ')') {
3464 RExC_parse++;
3465 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3466 }
830247a4 3467 nextchar(pRExC_state);
a0d0e21e
LW
3468 *flagp = TRYAGAIN;
3469 return NULL;
3470 }
3471 }
fac92740 3472 else { /* (...) */
830247a4
IZ
3473 parno = RExC_npar;
3474 RExC_npar++;
3475 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3476 Set_Node_Length(ret, 1); /* MJD */
3477 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 3478 is_open = 1;
a0d0e21e 3479 }
a0ed51b3 3480 }
fac92740 3481 else /* ! paren */
a0d0e21e
LW
3482 ret = NULL;
3483
3484 /* Pick up the branches, linking them together. */
fac92740 3485 parse_start = RExC_parse; /* MJD */
830247a4 3486 br = regbranch(pRExC_state, &flags, 1);
fac92740 3487 /* branch_len = (paren != 0); */
2af232bd 3488
a0d0e21e
LW
3489 if (br == NULL)
3490 return(NULL);
830247a4
IZ
3491 if (*RExC_parse == '|') {
3492 if (!SIZE_ONLY && RExC_extralen) {
3493 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3494 }
fac92740 3495 else { /* MJD */
830247a4 3496 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3497 Set_Node_Length(br, paren != 0);
3498 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3499 }
c277df42
IZ
3500 have_branch = 1;
3501 if (SIZE_ONLY)
830247a4 3502 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3503 }
3504 else if (paren == ':') {
c277df42
IZ
3505 *flagp |= flags&SIMPLE;
3506 }
6136c704 3507 if (is_open) { /* Starts with OPEN. */
830247a4 3508 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3509 }
3510 else if (paren != '?') /* Not Conditional */
a0d0e21e 3511 ret = br;
32a0ca98 3512 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3513 lastbr = br;
830247a4
IZ
3514 while (*RExC_parse == '|') {
3515 if (!SIZE_ONLY && RExC_extralen) {
3516 ender = reganode(pRExC_state, LONGJMP,0);
3517 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3518 }
3519 if (SIZE_ONLY)
830247a4
IZ
3520 RExC_extralen += 2; /* Account for LONGJMP. */
3521 nextchar(pRExC_state);
3522 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3523
a687059c 3524 if (br == NULL)
a0d0e21e 3525 return(NULL);
830247a4 3526 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3527 lastbr = br;
821b33a5
IZ
3528 if (flags&HASWIDTH)
3529 *flagp |= HASWIDTH;
a687059c 3530 *flagp |= flags&SPSTART;
a0d0e21e
LW
3531 }
3532
c277df42
IZ
3533 if (have_branch || paren != ':') {
3534 /* Make a closing node, and hook it on the end. */
3535 switch (paren) {
3536 case ':':
830247a4 3537 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3538 break;
3539 case 1:
830247a4 3540 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3541 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3542 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3543 break;
3544 case '<':
c277df42
IZ
3545 case ',':
3546 case '=':
3547 case '!':
c277df42 3548 *flagp &= ~HASWIDTH;
821b33a5
IZ
3549 /* FALL THROUGH */
3550 case '>':
830247a4 3551 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3552 break;
3553 case 0:
830247a4 3554 ender = reg_node(pRExC_state, END);
c277df42
IZ
3555 break;
3556 }
830247a4 3557 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3558
c277df42
IZ
3559 if (have_branch) {
3560 /* Hook the tails of the branches to the closing node. */
3561 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3562 regoptail(pRExC_state, br, ender);
c277df42
IZ
3563 }
3564 }
a0d0e21e