This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
gcc -Wall dewhine.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
f9373011
PM
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
7a799e20 403 IV offset = loc - RExC_precomp; \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
7a799e20 411 IV offset = loc - RExC_precomp; \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
7a799e20 419 IV offset = loc - RExC_precomp; \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
7a799e20 428 IV offset = loc - RExC_precomp; \
9d1d55b5
JP
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
653099ff 525 ANYOF_CLASS_ZERO(cl);
f8bef550 526 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 527 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
528 if (LOC)
529 cl->flags |= ANYOF_LOCALE;
530}
531
532/* Can match anything (initialization) */
533STATIC int
534S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
535{
536 int value;
537
aaa51d5e 538 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
539 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 return 1;
1aa99e6b
IH
541 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 return 0;
f8bef550
NC
543 if (!ANYOF_BITMAP_TESTALLSET(cl))
544 return 0;
653099ff
GS
545 return 1;
546}
547
548/* Can match anything (initialization) */
549STATIC void
830247a4 550S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 551{
8ecf7187 552 Zero(cl, 1, struct regnode_charclass_class);
653099ff 553 cl->type = ANYOF;
830247a4 554 cl_anything(pRExC_state, cl);
653099ff
GS
555}
556
557STATIC void
830247a4 558S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563 if (LOC)
564 cl->flags |= ANYOF_LOCALE;
565}
566
567/* 'And' a given class with another one. Can create false positives */
568/* We assume that cl is not inverted */
569STATIC void
570S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571 struct regnode_charclass_class *and_with)
572{
653099ff
GS
573 if (!(and_with->flags & ANYOF_CLASS)
574 && !(cl->flags & ANYOF_CLASS)
575 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576 && !(and_with->flags & ANYOF_FOLD)
577 && !(cl->flags & ANYOF_FOLD)) {
578 int i;
579
580 if (and_with->flags & ANYOF_INVERT)
581 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582 cl->bitmap[i] &= ~and_with->bitmap[i];
583 else
584 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585 cl->bitmap[i] &= and_with->bitmap[i];
586 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587 if (!(and_with->flags & ANYOF_EOS))
588 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
589
590 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
595 if (!(and_with->flags & ANYOF_UNICODE_ALL))
596 cl->flags &= ~ANYOF_UNICODE_ALL;
597 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
599}
600
601/* 'OR' a given class with another one. Can create false positives */
602/* We assume that cl is not inverted */
603STATIC void
830247a4 604S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 605{
653099ff
GS
606 if (or_with->flags & ANYOF_INVERT) {
607 /* We do not use
608 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609 * <= (B1 | !B2) | (CL1 | !CL2)
610 * which is wasteful if CL2 is small, but we ignore CL2:
611 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612 * XXXX Can we handle case-fold? Unclear:
613 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 */
616 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617 && !(or_with->flags & ANYOF_FOLD)
618 && !(cl->flags & ANYOF_FOLD) ) {
619 int i;
620
621 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622 cl->bitmap[i] |= ~or_with->bitmap[i];
623 } /* XXXX: logic is complicated otherwise */
624 else {
830247a4 625 cl_anything(pRExC_state, cl);
653099ff
GS
626 }
627 } else {
628 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 630 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
631 || (cl->flags & ANYOF_FOLD)) ) {
632 int i;
633
634 /* OR char bitmap and class bitmap separately */
635 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636 cl->bitmap[i] |= or_with->bitmap[i];
637 if (or_with->flags & ANYOF_CLASS) {
638 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639 cl->classflags[i] |= or_with->classflags[i];
640 cl->flags |= ANYOF_CLASS;
641 }
642 }
643 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 644 cl_anything(pRExC_state, cl);
653099ff
GS
645 }
646 }
647 if (or_with->flags & ANYOF_EOS)
648 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
649
650 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651 ARG(cl) != ARG(or_with)) {
652 cl->flags |= ANYOF_UNICODE_ALL;
653 cl->flags &= ~ANYOF_UNICODE;
654 }
655 if (or_with->flags & ANYOF_UNICODE_ALL) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
653099ff
GS
659}
660
5d1c421c
JH
661/*
662 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
663 * These need to be revisited when a newer toolchain becomes available.
664 */
665#if defined(__sparc64__) && defined(__GNUC__)
666# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
667# undef SPARC64_GCC_WORKAROUND
668# define SPARC64_GCC_WORKAROUND 1
669# endif
670#endif
671
653099ff
GS
672/* REx optimizer. Converts nodes into quickier variants "in place".
673 Finds fixed substrings. */
674
c277df42
IZ
675/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
676 to the position after last scanned or to NULL. */
677
76e3520e 678STATIC I32
830247a4 679S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
680 /* scanp: Start here (read-write). */
681 /* deltap: Write maxlen-minlen here. */
682 /* last: Stop before this one. */
683{
684 I32 min = 0, pars = 0, code;
685 regnode *scan = *scanp, *next;
686 I32 delta = 0;
687 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 688 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
689 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
690 scan_data_t data_fake;
653099ff 691 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 692
c277df42
IZ
693 while (scan && OP(scan) != END && scan < last) {
694 /* Peephole optimizer: */
695
22c35a8c 696 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 697 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
698 regnode *n = regnext(scan);
699 U32 stringok = 1;
700#ifdef DEBUGGING
701 regnode *stop = scan;
b81d288d 702#endif
c277df42 703
cd439c50 704 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
705 /* Skip NOTHING, merge EXACT*. */
706 while (n &&
b81d288d 707 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
708 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(n)
710 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
711 if (OP(n) == TAIL || n > next)
712 stringok = 0;
22c35a8c 713 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
714 NEXT_OFF(scan) += NEXT_OFF(n);
715 next = n + NODE_STEP_REGNODE;
716#ifdef DEBUGGING
717 if (stringok)
718 stop = n;
b81d288d 719#endif
c277df42 720 n = regnext(n);
a0ed51b3 721 }
f49d4d0f 722 else if (stringok) {
cd439c50 723 int oldl = STR_LEN(scan);
c277df42 724 regnode *nnext = regnext(n);
f49d4d0f 725
b81d288d 726 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
727 break;
728 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
729 STR_LEN(scan) += STR_LEN(n);
730 next = n + NODE_SZ_STR(n);
c277df42 731 /* Now we can overwrite *n : */
f49d4d0f 732 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 733#ifdef DEBUGGING
f49d4d0f 734 stop = next - 1;
b81d288d 735#endif
c277df42
IZ
736 n = nnext;
737 }
738 }
739#ifdef DEBUGGING
740 /* Allow dumping */
cd439c50 741 n = scan + NODE_SZ_STR(scan);
c277df42 742 while (n <= stop) {
22c35a8c 743 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
744 OP(n) = OPTIMIZED;
745 NEXT_OFF(n) = 0;
746 }
747 n++;
748 }
653099ff 749#endif
c277df42 750 }
653099ff
GS
751 /* Follow the next-chain of the current node and optimize
752 away all the NOTHINGs from it. */
c277df42 753 if (OP(scan) != CURLYX) {
048cfca1
GS
754 int max = (reg_off_by_arg[OP(scan)]
755 ? I32_MAX
756 /* I32 may be smaller than U16 on CRAYs! */
757 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
758 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
759 int noff;
760 regnode *n = scan;
b81d288d 761
c277df42
IZ
762 /* Skip NOTHING and LONGJMP. */
763 while ((n = regnext(n))
22c35a8c 764 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
765 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
766 && off + noff < max)
767 off += noff;
768 if (reg_off_by_arg[OP(scan)])
769 ARG(scan) = off;
b81d288d 770 else
c277df42
IZ
771 NEXT_OFF(scan) = off;
772 }
653099ff
GS
773 /* The principal pseudo-switch. Cannot be a switch, since we
774 look into several different things. */
b81d288d 775 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
776 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
777 next = regnext(scan);
778 code = OP(scan);
b81d288d
AB
779
780 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 781 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 782 struct regnode_charclass_class accum;
c277df42 783
653099ff 784 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 785 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 786 if (flags & SCF_DO_STCLASS)
830247a4 787 cl_init_zero(pRExC_state, &accum);
c277df42 788 while (OP(scan) == code) {
830247a4 789 I32 deltanext, minnext, f = 0, fake;
653099ff 790 struct regnode_charclass_class this_class;
c277df42
IZ
791
792 num++;
793 data_fake.flags = 0;
b81d288d 794 if (data) {
2c2d71f5 795 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
796 data_fake.last_closep = data->last_closep;
797 }
798 else
799 data_fake.last_closep = &fake;
c277df42
IZ
800 next = regnext(scan);
801 scan = NEXTOPER(scan);
802 if (code != BRANCH)
803 scan = NEXTOPER(scan);
653099ff 804 if (flags & SCF_DO_STCLASS) {
830247a4 805 cl_init(pRExC_state, &this_class);
653099ff
GS
806 data_fake.start_class = &this_class;
807 f = SCF_DO_STCLASS_AND;
b81d288d 808 }
e1901655
IZ
809 if (flags & SCF_WHILEM_VISITED_POS)
810 f |= SCF_WHILEM_VISITED_POS;
653099ff 811 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
812 minnext = study_chunk(pRExC_state, &scan, &deltanext,
813 next, &data_fake, f);
b81d288d 814 if (min1 > minnext)
c277df42
IZ
815 min1 = minnext;
816 if (max1 < minnext + deltanext)
817 max1 = minnext + deltanext;
818 if (deltanext == I32_MAX)
aca2d497 819 is_inf = is_inf_internal = 1;
c277df42
IZ
820 scan = next;
821 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
822 pars++;
405ff068 823 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 824 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
825 if (data)
826 data->whilem_c = data_fake.whilem_c;
653099ff 827 if (flags & SCF_DO_STCLASS)
830247a4 828 cl_or(pRExC_state, &accum, &this_class);
b81d288d 829 if (code == SUSPEND)
c277df42
IZ
830 break;
831 }
832 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
833 min1 = 0;
834 if (flags & SCF_DO_SUBSTR) {
835 data->pos_min += min1;
836 data->pos_delta += max1 - min1;
837 if (max1 != min1 || is_inf)
838 data->longest = &(data->longest_float);
839 }
840 min += min1;
841 delta += max1 - min1;
653099ff 842 if (flags & SCF_DO_STCLASS_OR) {
830247a4 843 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
844 if (min1) {
845 cl_and(data->start_class, &and_with);
846 flags &= ~SCF_DO_STCLASS;
847 }
848 }
849 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
850 if (min1) {
851 cl_and(data->start_class, &accum);
653099ff 852 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
853 }
854 else {
b81d288d 855 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
856 * data->start_class */
857 StructCopy(data->start_class, &and_with,
858 struct regnode_charclass_class);
859 flags &= ~SCF_DO_STCLASS_AND;
860 StructCopy(&accum, data->start_class,
861 struct regnode_charclass_class);
862 flags |= SCF_DO_STCLASS_OR;
863 data->start_class->flags |= ANYOF_EOS;
864 }
653099ff 865 }
a0ed51b3
LW
866 }
867 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
868 scan = NEXTOPER(NEXTOPER(scan));
869 else /* single branch is optimized. */
870 scan = NEXTOPER(scan);
871 continue;
a0ed51b3
LW
872 }
873 else if (OP(scan) == EXACT) {
cd439c50 874 I32 l = STR_LEN(scan);
1aa99e6b 875 UV uc = *((U8*)STRING(scan));
a0ed51b3 876 if (UTF) {
1aa99e6b
IH
877 U8 *s = (U8*)STRING(scan);
878 l = utf8_length(s, s + l);
9041c2e3 879 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
880 }
881 min += l;
c277df42 882 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
883 /* The code below prefers earlier match for fixed
884 offset, later match for variable offset. */
885 if (data->last_end == -1) { /* Update the start info. */
886 data->last_start_min = data->pos_min;
887 data->last_start_max = is_inf
b81d288d 888 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 889 }
cd439c50 890 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
891 data->last_end = data->pos_min + l;
892 data->pos_min += l; /* As in the first entry. */
893 data->flags &= ~SF_BEFORE_EOL;
894 }
653099ff
GS
895 if (flags & SCF_DO_STCLASS_AND) {
896 /* Check whether it is compatible with what we know already! */
897 int compat = 1;
898
1aa99e6b 899 if (uc >= 0x100 ||
516a5887 900 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 901 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 902 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 903 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 904 )
653099ff
GS
905 compat = 0;
906 ANYOF_CLASS_ZERO(data->start_class);
907 ANYOF_BITMAP_ZERO(data->start_class);
908 if (compat)
1aa99e6b 909 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 910 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
911 if (uc < 0x100)
912 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
913 }
914 else if (flags & SCF_DO_STCLASS_OR) {
915 /* false positive possible if the class is case-folded */
1aa99e6b 916 if (uc < 0x100)
9b877dbb
IH
917 ANYOF_BITMAP_SET(data->start_class, uc);
918 else
919 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
920 data->start_class->flags &= ~ANYOF_EOS;
921 cl_and(data->start_class, &and_with);
922 }
923 flags &= ~SCF_DO_STCLASS;
a0ed51b3 924 }
653099ff 925 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 926 I32 l = STR_LEN(scan);
1aa99e6b 927 UV uc = *((U8*)STRING(scan));
653099ff
GS
928
929 /* Search for fixed substrings supports EXACT only. */
b81d288d 930 if (flags & SCF_DO_SUBSTR)
830247a4 931 scan_commit(pRExC_state, data);
a0ed51b3 932 if (UTF) {
1aa99e6b
IH
933 U8 *s = (U8 *)STRING(scan);
934 l = utf8_length(s, s + l);
9041c2e3 935 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
936 }
937 min += l;
c277df42 938 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 939 data->pos_min += l;
653099ff
GS
940 if (flags & SCF_DO_STCLASS_AND) {
941 /* Check whether it is compatible with what we know already! */
942 int compat = 1;
943
1aa99e6b 944 if (uc >= 0x100 ||
516a5887 945 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 946 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 947 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
948 compat = 0;
949 ANYOF_CLASS_ZERO(data->start_class);
950 ANYOF_BITMAP_ZERO(data->start_class);
951 if (compat) {
1aa99e6b 952 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
953 data->start_class->flags &= ~ANYOF_EOS;
954 data->start_class->flags |= ANYOF_FOLD;
955 if (OP(scan) == EXACTFL)
956 data->start_class->flags |= ANYOF_LOCALE;
957 }
958 }
959 else if (flags & SCF_DO_STCLASS_OR) {
960 if (data->start_class->flags & ANYOF_FOLD) {
961 /* false positive possible if the class is case-folded.
962 Assume that the locale settings are the same... */
1aa99e6b
IH
963 if (uc < 0x100)
964 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
965 data->start_class->flags &= ~ANYOF_EOS;
966 }
967 cl_and(data->start_class, &and_with);
968 }
969 flags &= ~SCF_DO_STCLASS;
a0ed51b3 970 }
4d61ec05 971 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 972 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 973 I32 f = flags, pos_before = 0;
c277df42 974 regnode *oscan = scan;
653099ff
GS
975 struct regnode_charclass_class this_class;
976 struct regnode_charclass_class *oclass = NULL;
727f22e3 977 I32 next_is_eval = 0;
653099ff 978
22c35a8c 979 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 980 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
981 scan = NEXTOPER(scan);
982 goto finish;
983 case PLUS:
653099ff 984 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 985 next = NEXTOPER(scan);
653099ff 986 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
987 mincount = 1;
988 maxcount = REG_INFTY;
c277df42
IZ
989 next = regnext(scan);
990 scan = NEXTOPER(scan);
991 goto do_curly;
992 }
993 }
994 if (flags & SCF_DO_SUBSTR)
995 data->pos_min++;
996 min++;
997 /* Fall through. */
998 case STAR:
653099ff
GS
999 if (flags & SCF_DO_STCLASS) {
1000 mincount = 0;
b81d288d 1001 maxcount = REG_INFTY;
653099ff
GS
1002 next = regnext(scan);
1003 scan = NEXTOPER(scan);
1004 goto do_curly;
1005 }
b81d288d 1006 is_inf = is_inf_internal = 1;
c277df42
IZ
1007 scan = regnext(scan);
1008 if (flags & SCF_DO_SUBSTR) {
830247a4 1009 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1010 data->longest = &(data->longest_float);
1011 }
1012 goto optimize_curly_tail;
1013 case CURLY:
b81d288d 1014 mincount = ARG1(scan);
c277df42
IZ
1015 maxcount = ARG2(scan);
1016 next = regnext(scan);
cb434fcc
IZ
1017 if (OP(scan) == CURLYX) {
1018 I32 lp = (data ? *(data->last_closep) : 0);
1019
1020 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1021 }
c277df42 1022 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1023 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1024 do_curly:
1025 if (flags & SCF_DO_SUBSTR) {
830247a4 1026 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1027 pos_before = data->pos_min;
1028 }
1029 if (data) {
1030 fl = data->flags;
1031 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1032 if (is_inf)
1033 data->flags |= SF_IS_INF;
1034 }
653099ff 1035 if (flags & SCF_DO_STCLASS) {
830247a4 1036 cl_init(pRExC_state, &this_class);
653099ff
GS
1037 oclass = data->start_class;
1038 data->start_class = &this_class;
1039 f |= SCF_DO_STCLASS_AND;
1040 f &= ~SCF_DO_STCLASS_OR;
1041 }
e1901655
IZ
1042 /* These are the cases when once a subexpression
1043 fails at a particular position, it cannot succeed
1044 even after backtracking at the enclosing scope.
b81d288d 1045
e1901655
IZ
1046 XXXX what if minimal match and we are at the
1047 initial run of {n,m}? */
1048 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1049 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1050
c277df42 1051 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1052 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1053 mincount == 0
653099ff
GS
1054 ? (f & ~SCF_DO_SUBSTR) : f);
1055
1056 if (flags & SCF_DO_STCLASS)
1057 data->start_class = oclass;
1058 if (mincount == 0 || minnext == 0) {
1059 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1060 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1061 }
1062 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1063 /* Switch to OR mode: cache the old value of
653099ff
GS
1064 * data->start_class */
1065 StructCopy(data->start_class, &and_with,
1066 struct regnode_charclass_class);
1067 flags &= ~SCF_DO_STCLASS_AND;
1068 StructCopy(&this_class, data->start_class,
1069 struct regnode_charclass_class);
1070 flags |= SCF_DO_STCLASS_OR;
1071 data->start_class->flags |= ANYOF_EOS;
1072 }
1073 } else { /* Non-zero len */
1074 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1075 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1076 cl_and(data->start_class, &and_with);
1077 }
1078 else if (flags & SCF_DO_STCLASS_AND)
1079 cl_and(data->start_class, &this_class);
1080 flags &= ~SCF_DO_STCLASS;
1081 }
c277df42
IZ
1082 if (!scan) /* It was not CURLYX, but CURLY. */
1083 scan = next;
84037bb0 1084 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1085 /* ? quantifier ok, except for (?{ ... }) */
1086 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1087 && (minnext == 0) && (deltanext == 0)
99799961 1088 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1089 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1090 {
830247a4 1091 vWARN(RExC_parse,
b45f050a
JF
1092 "Quantifier unexpected on zero-length expression");
1093 }
1094
c277df42 1095 min += minnext * mincount;
b81d288d 1096 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1097 && (minnext + deltanext) > 0)
1098 || deltanext == I32_MAX);
aca2d497 1099 is_inf |= is_inf_internal;
c277df42
IZ
1100 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1101
1102 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1103 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1104 && data->flags & SF_IN_PAR
1105 && !(data->flags & SF_HAS_EVAL)
1106 && !deltanext && minnext == 1 ) {
1107 /* Try to optimize to CURLYN. */
1108 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1109 regnode *nxt1 = nxt;
1110#ifdef DEBUGGING
1111 regnode *nxt2;
1112#endif
c277df42
IZ
1113
1114 /* Skip open. */
1115 nxt = regnext(nxt);
4d61ec05 1116 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1117 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1118 && STR_LEN(nxt) == 1))
c277df42 1119 goto nogo;
497b47a8 1120#ifdef DEBUGGING
c277df42 1121 nxt2 = nxt;
497b47a8 1122#endif
c277df42 1123 nxt = regnext(nxt);
b81d288d 1124 if (OP(nxt) != CLOSE)
c277df42
IZ
1125 goto nogo;
1126 /* Now we know that nxt2 is the only contents: */
1127 oscan->flags = ARG(nxt);
1128 OP(oscan) = CURLYN;
1129 OP(nxt1) = NOTHING; /* was OPEN. */
1130#ifdef DEBUGGING
1131 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1132 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1133 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1134 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1135 OP(nxt + 1) = OPTIMIZED; /* was count. */
1136 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1137#endif
c277df42 1138 }
c277df42
IZ
1139 nogo:
1140
1141 /* Try optimization CURLYX => CURLYM. */
b81d288d 1142 if ( OP(oscan) == CURLYX && data
c277df42 1143 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1144 && !(data->flags & SF_HAS_EVAL)
1145 && !deltanext ) {
1146 /* XXXX How to optimize if data == 0? */
1147 /* Optimize to a simpler form. */
1148 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1149 regnode *nxt2;
1150
1151 OP(oscan) = CURLYM;
1152 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1153 && (OP(nxt2) != WHILEM))
c277df42
IZ
1154 nxt = nxt2;
1155 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1156 /* Need to optimize away parenths. */
1157 if (data->flags & SF_IN_PAR) {
1158 /* Set the parenth number. */
1159 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1160
b81d288d 1161 if (OP(nxt) != CLOSE)
b45f050a 1162 FAIL("Panic opt close");
c277df42
IZ
1163 oscan->flags = ARG(nxt);
1164 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1165 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1166#ifdef DEBUGGING
1167 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1168 OP(nxt + 1) = OPTIMIZED; /* was count. */
1169 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1170 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1171#endif
c277df42
IZ
1172#if 0
1173 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1174 regnode *nnxt = regnext(nxt1);
b81d288d 1175
c277df42
IZ
1176 if (nnxt == nxt) {
1177 if (reg_off_by_arg[OP(nxt1)])
1178 ARG_SET(nxt1, nxt2 - nxt1);
1179 else if (nxt2 - nxt1 < U16_MAX)
1180 NEXT_OFF(nxt1) = nxt2 - nxt1;
1181 else
1182 OP(nxt) = NOTHING; /* Cannot beautify */
1183 }
1184 nxt1 = nnxt;
1185 }
1186#endif
1187 /* Optimize again: */
b81d288d 1188 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1189 NULL, 0);
a0ed51b3
LW
1190 }
1191 else
c277df42 1192 oscan->flags = 0;
c277df42 1193 }
e1901655
IZ
1194 else if ((OP(oscan) == CURLYX)
1195 && (flags & SCF_WHILEM_VISITED_POS)
1196 /* See the comment on a similar expression above.
1197 However, this time it not a subexpression
1198 we care about, but the expression itself. */
1199 && (maxcount == REG_INFTY)
1200 && data && ++data->whilem_c < 16) {
1201 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1202 /* Find WHILEM (as in regexec.c) */
1203 regnode *nxt = oscan + NEXT_OFF(oscan);
1204
1205 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1206 nxt += ARG(nxt);
1207 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1208 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1209 }
b81d288d 1210 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1211 pars++;
1212 if (flags & SCF_DO_SUBSTR) {
1213 SV *last_str = Nullsv;
1214 int counted = mincount != 0;
1215
1216 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1217#if defined(SPARC64_GCC_WORKAROUND)
1218 I32 b = 0;
1219 STRLEN l = 0;
1220 char *s = NULL;
1221 I32 old = 0;
1222
1223 if (pos_before >= data->last_start_min)
1224 b = pos_before;
1225 else
1226 b = data->last_start_min;
1227
1228 l = 0;
1229 s = SvPV(data->last_found, l);
1230 old = b - data->last_start_min;
1231
1232#else
b81d288d 1233 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1234 ? pos_before : data->last_start_min;
1235 STRLEN l;
1236 char *s = SvPV(data->last_found, l);
a0ed51b3 1237 I32 old = b - data->last_start_min;
5d1c421c 1238#endif
a0ed51b3
LW
1239
1240 if (UTF)
1241 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1242
a0ed51b3 1243 l -= old;
c277df42 1244 /* Get the added string: */
79cb57f6 1245 last_str = newSVpvn(s + old, l);
c277df42
IZ
1246 if (deltanext == 0 && pos_before == b) {
1247 /* What was added is a constant string */
1248 if (mincount > 1) {
1249 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1250 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1251 SvPVX(last_str), l, mincount - 1);
1252 SvCUR(last_str) *= mincount;
1253 /* Add additional parts. */
b81d288d 1254 SvCUR_set(data->last_found,
c277df42
IZ
1255 SvCUR(data->last_found) - l);
1256 sv_catsv(data->last_found, last_str);
1257 data->last_end += l * (mincount - 1);
1258 }
2a8d9689
HS
1259 } else {
1260 /* start offset must point into the last copy */
1261 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1262 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1263 * (minnext + data->pos_delta);
c277df42
IZ
1264 }
1265 }
1266 /* It is counted once already... */
1267 data->pos_min += minnext * (mincount - counted);
1268 data->pos_delta += - counted * deltanext +
1269 (minnext + deltanext) * maxcount - minnext * mincount;
1270 if (mincount != maxcount) {
653099ff
GS
1271 /* Cannot extend fixed substrings found inside
1272 the group. */
830247a4 1273 scan_commit(pRExC_state,data);
c277df42
IZ
1274 if (mincount && last_str) {
1275 sv_setsv(data->last_found, last_str);
1276 data->last_end = data->pos_min;
b81d288d 1277 data->last_start_min =
a0ed51b3 1278 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1279 data->last_start_max = is_inf
1280 ? I32_MAX
c277df42 1281 : data->pos_min + data->pos_delta
a0ed51b3 1282 - CHR_SVLEN(last_str);
c277df42
IZ
1283 }
1284 data->longest = &(data->longest_float);
1285 }
aca2d497 1286 SvREFCNT_dec(last_str);
c277df42 1287 }
405ff068 1288 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1289 data->flags |= SF_HAS_EVAL;
1290 optimize_curly_tail:
c277df42 1291 if (OP(oscan) != CURLYX) {
22c35a8c 1292 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1293 && NEXT_OFF(next))
1294 NEXT_OFF(oscan) += NEXT_OFF(next);
1295 }
c277df42 1296 continue;
653099ff 1297 default: /* REF and CLUMP only? */
c277df42 1298 if (flags & SCF_DO_SUBSTR) {
830247a4 1299 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1300 data->longest = &(data->longest_float);
1301 }
aca2d497 1302 is_inf = is_inf_internal = 1;
653099ff 1303 if (flags & SCF_DO_STCLASS_OR)
830247a4 1304 cl_anything(pRExC_state, data->start_class);
653099ff 1305 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1306 break;
1307 }
a0ed51b3 1308 }
ffc61ed2 1309 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1310 int value = 0;
653099ff 1311
c277df42 1312 if (flags & SCF_DO_SUBSTR) {
830247a4 1313 scan_commit(pRExC_state,data);
c277df42
IZ
1314 data->pos_min++;
1315 }
1316 min++;
653099ff
GS
1317 if (flags & SCF_DO_STCLASS) {
1318 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1319
1320 /* Some of the logic below assumes that switching
1321 locale on will only add false positives. */
1322 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1323 case SANY:
653099ff
GS
1324 default:
1325 do_default:
1326 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1327 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1328 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1329 break;
1330 case REG_ANY:
1331 if (OP(scan) == SANY)
1332 goto do_default;
1333 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1334 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1335 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1336 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1337 }
1338 if (flags & SCF_DO_STCLASS_AND || !value)
1339 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1340 break;
1341 case ANYOF:
1342 if (flags & SCF_DO_STCLASS_AND)
1343 cl_and(data->start_class,
1344 (struct regnode_charclass_class*)scan);
1345 else
830247a4 1346 cl_or(pRExC_state, data->start_class,
653099ff
GS
1347 (struct regnode_charclass_class*)scan);
1348 break;
1349 case ALNUM:
1350 if (flags & SCF_DO_STCLASS_AND) {
1351 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1352 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1353 for (value = 0; value < 256; value++)
1354 if (!isALNUM(value))
1355 ANYOF_BITMAP_CLEAR(data->start_class, value);
1356 }
1357 }
1358 else {
1359 if (data->start_class->flags & ANYOF_LOCALE)
1360 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1361 else {
1362 for (value = 0; value < 256; value++)
1363 if (isALNUM(value))
b81d288d 1364 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1365 }
1366 }
1367 break;
1368 case ALNUML:
1369 if (flags & SCF_DO_STCLASS_AND) {
1370 if (data->start_class->flags & ANYOF_LOCALE)
1371 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1372 }
1373 else {
1374 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1375 data->start_class->flags |= ANYOF_LOCALE;
1376 }
1377 break;
1378 case NALNUM:
1379 if (flags & SCF_DO_STCLASS_AND) {
1380 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1382 for (value = 0; value < 256; value++)
1383 if (isALNUM(value))
1384 ANYOF_BITMAP_CLEAR(data->start_class, value);
1385 }
1386 }
1387 else {
1388 if (data->start_class->flags & ANYOF_LOCALE)
1389 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1390 else {
1391 for (value = 0; value < 256; value++)
1392 if (!isALNUM(value))
b81d288d 1393 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1394 }
1395 }
1396 break;
1397 case NALNUML:
1398 if (flags & SCF_DO_STCLASS_AND) {
1399 if (data->start_class->flags & ANYOF_LOCALE)
1400 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1401 }
1402 else {
1403 data->start_class->flags |= ANYOF_LOCALE;
1404 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1405 }
1406 break;
1407 case SPACE:
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1411 for (value = 0; value < 256; value++)
1412 if (!isSPACE(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1414 }
1415 }
1416 else {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1419 else {
1420 for (value = 0; value < 256; value++)
1421 if (isSPACE(value))
b81d288d 1422 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1423 }
1424 }
1425 break;
1426 case SPACEL:
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1430 }
1431 else {
1432 data->start_class->flags |= ANYOF_LOCALE;
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1434 }
1435 break;
1436 case NSPACE:
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1440 for (value = 0; value < 256; value++)
1441 if (isSPACE(value))
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1443 }
1444 }
1445 else {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1448 else {
1449 for (value = 0; value < 256; value++)
1450 if (!isSPACE(value))
b81d288d 1451 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1452 }
1453 }
1454 break;
1455 case NSPACEL:
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE) {
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1459 for (value = 0; value < 256; value++)
1460 if (!isSPACE(value))
1461 ANYOF_BITMAP_CLEAR(data->start_class, value);
1462 }
1463 }
1464 else {
1465 data->start_class->flags |= ANYOF_LOCALE;
1466 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1467 }
1468 break;
1469 case DIGIT:
1470 if (flags & SCF_DO_STCLASS_AND) {
1471 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1472 for (value = 0; value < 256; value++)
1473 if (!isDIGIT(value))
1474 ANYOF_BITMAP_CLEAR(data->start_class, value);
1475 }
1476 else {
1477 if (data->start_class->flags & ANYOF_LOCALE)
1478 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1479 else {
1480 for (value = 0; value < 256; value++)
1481 if (isDIGIT(value))
b81d288d 1482 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1483 }
1484 }
1485 break;
1486 case NDIGIT:
1487 if (flags & SCF_DO_STCLASS_AND) {
1488 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1489 for (value = 0; value < 256; value++)
1490 if (isDIGIT(value))
1491 ANYOF_BITMAP_CLEAR(data->start_class, value);
1492 }
1493 else {
1494 if (data->start_class->flags & ANYOF_LOCALE)
1495 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1496 else {
1497 for (value = 0; value < 256; value++)
1498 if (!isDIGIT(value))
b81d288d 1499 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1500 }
1501 }
1502 break;
1503 }
1504 if (flags & SCF_DO_STCLASS_OR)
1505 cl_and(data->start_class, &and_with);
1506 flags &= ~SCF_DO_STCLASS;
1507 }
a0ed51b3 1508 }
22c35a8c 1509 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1510 data->flags |= (OP(scan) == MEOL
1511 ? SF_BEFORE_MEOL
1512 : SF_BEFORE_SEOL);
a0ed51b3 1513 }
653099ff
GS
1514 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1515 /* Lookbehind, or need to calculate parens/evals/stclass: */
1516 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1517 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1518 /* Lookahead/lookbehind */
cb434fcc 1519 I32 deltanext, minnext, fake = 0;
c277df42 1520 regnode *nscan;
653099ff
GS
1521 struct regnode_charclass_class intrnl;
1522 int f = 0;
c277df42
IZ
1523
1524 data_fake.flags = 0;
b81d288d 1525 if (data) {
2c2d71f5 1526 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1527 data_fake.last_closep = data->last_closep;
1528 }
1529 else
1530 data_fake.last_closep = &fake;
653099ff
GS
1531 if ( flags & SCF_DO_STCLASS && !scan->flags
1532 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1533 cl_init(pRExC_state, &intrnl);
653099ff 1534 data_fake.start_class = &intrnl;
e1901655 1535 f |= SCF_DO_STCLASS_AND;
653099ff 1536 }
e1901655
IZ
1537 if (flags & SCF_WHILEM_VISITED_POS)
1538 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1539 next = regnext(scan);
1540 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1541 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1542 if (scan->flags) {
1543 if (deltanext) {
9baa0206 1544 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1545 }
1546 else if (minnext > U8_MAX) {
9baa0206 1547 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1548 }
1549 scan->flags = minnext;
1550 }
1551 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1552 pars++;
405ff068 1553 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1554 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1555 if (data)
1556 data->whilem_c = data_fake.whilem_c;
e1901655 1557 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1558 int was = (data->start_class->flags & ANYOF_EOS);
1559
1560 cl_and(data->start_class, &intrnl);
1561 if (was)
1562 data->start_class->flags |= ANYOF_EOS;
1563 }
a0ed51b3
LW
1564 }
1565 else if (OP(scan) == OPEN) {
c277df42 1566 pars++;
a0ed51b3 1567 }
cb434fcc
IZ
1568 else if (OP(scan) == CLOSE) {
1569 if (ARG(scan) == is_par) {
1570 next = regnext(scan);
c277df42 1571
cb434fcc
IZ
1572 if ( next && (OP(next) != WHILEM) && next < last)
1573 is_par = 0; /* Disable optimization */
1574 }
1575 if (data)
1576 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1577 }
1578 else if (OP(scan) == EVAL) {
c277df42
IZ
1579 if (data)
1580 data->flags |= SF_HAS_EVAL;
1581 }
96776eda 1582 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1583 if (flags & SCF_DO_SUBSTR) {
830247a4 1584 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1585 data->longest = &(data->longest_float);
1586 }
1587 is_inf = is_inf_internal = 1;
653099ff 1588 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1589 cl_anything(pRExC_state, data->start_class);
96776eda 1590 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1591 }
c277df42
IZ
1592 /* Else: zero-length, ignore. */
1593 scan = regnext(scan);
1594 }
1595
1596 finish:
1597 *scanp = scan;
aca2d497 1598 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1599 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1600 data->pos_delta = I32_MAX - data->pos_min;
1601 if (is_par > U8_MAX)
1602 is_par = 0;
1603 if (is_par && pars==1 && data) {
1604 data->flags |= SF_IN_PAR;
1605 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1606 }
1607 else if (pars && data) {
c277df42
IZ
1608 data->flags |= SF_HAS_PAR;
1609 data->flags &= ~SF_IN_PAR;
1610 }
653099ff
GS
1611 if (flags & SCF_DO_STCLASS_OR)
1612 cl_and(data->start_class, &and_with);
c277df42
IZ
1613 return min;
1614}
1615
76e3520e 1616STATIC I32
830247a4 1617S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1618{
830247a4 1619 if (RExC_rx->data) {
b81d288d
AB
1620 Renewc(RExC_rx->data,
1621 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1622 char, struct reg_data);
830247a4
IZ
1623 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1624 RExC_rx->data->count += n;
a0ed51b3
LW
1625 }
1626 else {
830247a4 1627 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1628 char, struct reg_data);
830247a4
IZ
1629 New(1208, RExC_rx->data->what, n, U8);
1630 RExC_rx->data->count = n;
c277df42 1631 }
830247a4
IZ
1632 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1633 return RExC_rx->data->count - n;
c277df42
IZ
1634}
1635
d88dccdf 1636void
864dbfa3 1637Perl_reginitcolors(pTHX)
d88dccdf 1638{
d88dccdf
IZ
1639 int i = 0;
1640 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1641
d88dccdf
IZ
1642 if (s) {
1643 PL_colors[0] = s = savepv(s);
1644 while (++i < 6) {
1645 s = strchr(s, '\t');
1646 if (s) {
1647 *s = '\0';
1648 PL_colors[i] = ++s;
1649 }
1650 else
c712d376 1651 PL_colors[i] = s = "";
d88dccdf
IZ
1652 }
1653 } else {
b81d288d 1654 while (i < 6)
d88dccdf
IZ
1655 PL_colors[i++] = "";
1656 }
1657 PL_colorset = 1;
1658}
1659
8615cb43 1660
a687059c 1661/*
e50aee73 1662 - pregcomp - compile a regular expression into internal code
a687059c
LW
1663 *
1664 * We can't allocate space until we know how big the compiled form will be,
1665 * but we can't compile it (and thus know how big it is) until we've got a
1666 * place to put the code. So we cheat: we compile it twice, once with code
1667 * generation turned off and size counting turned on, and once "for real".
1668 * This also means that we don't allocate space until we are sure that the
1669 * thing really will compile successfully, and we never have to move the
1670 * code and thus invalidate pointers into it. (Note that it has to be in
1671 * one piece because free() must be able to free it all.) [NB: not true in perl]
1672 *
1673 * Beware that the optimization-preparation code in here knows about some
1674 * of the structure of the compiled regexp. [I'll say.]
1675 */
1676regexp *
864dbfa3 1677Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1678{
a0d0e21e 1679 register regexp *r;
c277df42 1680 regnode *scan;
c277df42 1681 regnode *first;
a0d0e21e 1682 I32 flags;
a0d0e21e
LW
1683 I32 minlen = 0;
1684 I32 sawplus = 0;
1685 I32 sawopen = 0;
2c2d71f5 1686 scan_data_t data;
830247a4
IZ
1687 RExC_state_t RExC_state;
1688 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1689
1690 if (exp == NULL)
c277df42 1691 FAIL("NULL regexp argument");
a0d0e21e 1692
a5961de5 1693 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1694
5cfc7842 1695 RExC_precomp = exp;
a5961de5
JH
1696 DEBUG_r({
1697 if (!PL_colorset) reginitcolors();
1698 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1699 PL_colors[4],PL_colors[5],PL_colors[0],
1700 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1701 });
830247a4
IZ
1702 RExC_flags16 = pm->op_pmflags;
1703 RExC_sawback = 0;
bbce6d69 1704
830247a4
IZ
1705 RExC_seen = 0;
1706 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1707 RExC_seen_evals = 0;
1708 RExC_extralen = 0;
c277df42 1709
bbce6d69 1710 /* First pass: determine size, legality. */
830247a4 1711 RExC_parse = exp;
fac92740 1712 RExC_start = exp;
830247a4
IZ
1713 RExC_end = xend;
1714 RExC_naughty = 0;
1715 RExC_npar = 1;
1716 RExC_size = 0L;
1717 RExC_emit = &PL_regdummy;
1718 RExC_whilem_seen = 0;
85ddcde9
JH
1719#if 0 /* REGC() is (currently) a NOP at the first pass.
1720 * Clever compilers notice this and complain. --jhi */
830247a4 1721 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1722#endif
830247a4 1723 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1724 RExC_precomp = Nullch;
a0d0e21e
LW
1725 return(NULL);
1726 }
830247a4 1727 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1728
c277df42
IZ
1729 /* Small enough for pointer-storage convention?
1730 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1731 if (RExC_size >= 0x10000L && RExC_extralen)
1732 RExC_size += RExC_extralen;
c277df42 1733 else
830247a4
IZ
1734 RExC_extralen = 0;
1735 if (RExC_whilem_seen > 15)
1736 RExC_whilem_seen = 15;
a0d0e21e 1737
bbce6d69 1738 /* Allocate space and initialize. */
830247a4 1739 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1740 char, regexp);
a0d0e21e 1741 if (r == NULL)
b45f050a
JF
1742 FAIL("Regexp out of space");
1743
0f79a09d
GS
1744#ifdef DEBUGGING
1745 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1746 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1747#endif
c277df42 1748 r->refcnt = 1;
bbce6d69 1749 r->prelen = xend - exp;
5cfc7842 1750 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1751 r->subbeg = NULL;
1752 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1753 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1754
1755 r->substrs = 0; /* Useful during FAIL. */
1756 r->startp = 0; /* Useful during FAIL. */
1757 r->endp = 0; /* Useful during FAIL. */
1758
fac92740
MJD
1759 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1760 if (r->offsets) {
1761 r->offsets[0] = RExC_size;
1762 }
1763 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1764 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1765 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1766 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1767
830247a4 1768 RExC_rx = r;
bbce6d69 1769
1770 /* Second pass: emit code. */
055bb491 1771 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1772 RExC_parse = exp;
1773 RExC_end = xend;
1774 RExC_naughty = 0;
1775 RExC_npar = 1;
fac92740 1776 RExC_emit_start = r->program;
830247a4 1777 RExC_emit = r->program;
2cd61cdb 1778 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1779 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1780 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1781 r->data = 0;
830247a4 1782 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1783 return(NULL);
1784
1785 /* Dig out information for optimizations. */
cf93c79d 1786 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1787 pm->op_pmflags = RExC_flags16;
a0ed51b3 1788 if (UTF)
5ff6fc6d 1789 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1790 r->regstclass = NULL;
830247a4 1791 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1792 r->reganch |= ROPT_NAUGHTY;
c277df42 1793 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1794
1795 /* XXXX To minimize changes to RE engine we always allocate
1796 3-units-long substrs field. */
1797 Newz(1004, r->substrs, 1, struct reg_substr_data);
1798
2c2d71f5 1799 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1800 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1801 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1802 I32 fake;
c5254dd6 1803 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1804 struct regnode_charclass_class ch_class;
1805 int stclass_flag;
cb434fcc 1806 I32 last_close = 0;
a0d0e21e
LW
1807
1808 first = scan;
c277df42 1809 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1810 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1811 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1812 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1813 (OP(first) == PLUS) ||
1814 (OP(first) == MINMOD) ||
653099ff 1815 /* An {n,m} with n>0 */
22c35a8c 1816 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1817 if (OP(first) == PLUS)
1818 sawplus = 1;
1819 else
1820 first += regarglen[(U8)OP(first)];
1821 first = NEXTOPER(first);
a687059c
LW
1822 }
1823
a0d0e21e
LW
1824 /* Starting-point info. */
1825 again:
653099ff 1826 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1827 if (OP(first) == EXACT)
1828 ; /* Empty, get anchored substr later. */
1829 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1830 r->regstclass = first;
1831 }
653099ff 1832 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1833 r->regstclass = first;
22c35a8c
GS
1834 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1835 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1836 r->regstclass = first;
22c35a8c 1837 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1838 r->reganch |= (OP(first) == MBOL
1839 ? ROPT_ANCH_MBOL
1840 : (OP(first) == SBOL
1841 ? ROPT_ANCH_SBOL
1842 : ROPT_ANCH_BOL));
a0d0e21e 1843 first = NEXTOPER(first);
774d564b 1844 goto again;
1845 }
1846 else if (OP(first) == GPOS) {
1847 r->reganch |= ROPT_ANCH_GPOS;
1848 first = NEXTOPER(first);
1849 goto again;
a0d0e21e
LW
1850 }
1851 else if ((OP(first) == STAR &&
22c35a8c 1852 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1853 !(r->reganch & ROPT_ANCH) )
1854 {
1855 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1856 int type = OP(NEXTOPER(first));
1857
ffc61ed2 1858 if (type == REG_ANY)
cad2e5aa
JH
1859 type = ROPT_ANCH_MBOL;
1860 else
1861 type = ROPT_ANCH_SBOL;
1862
1863 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1864 first = NEXTOPER(first);
774d564b 1865 goto again;
a0d0e21e 1866 }
b81d288d 1867 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1868 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1869 /* x+ must match at the 1st pos of run of x's */
1870 r->reganch |= ROPT_SKIP;
a0d0e21e 1871
c277df42 1872 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1873 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1874 (IV)(first - scan + 1)));
a0d0e21e
LW
1875 /*
1876 * If there's something expensive in the r.e., find the
1877 * longest literal string that must appear and make it the
1878 * regmust. Resolve ties in favor of later strings, since
1879 * the regstart check works with the beginning of the r.e.
1880 * and avoiding duplication strengthens checking. Not a
1881 * strong reason, but sufficient in the absence of others.
1882 * [Now we resolve ties in favor of the earlier string if
c277df42 1883 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1884 * earlier string may buy us something the later one won't.]
1885 */
a0d0e21e 1886 minlen = 0;
a687059c 1887
79cb57f6
GS
1888 data.longest_fixed = newSVpvn("",0);
1889 data.longest_float = newSVpvn("",0);
1890 data.last_found = newSVpvn("",0);
c277df42
IZ
1891 data.longest = &(data.longest_fixed);
1892 first = scan;
653099ff 1893 if (!r->regstclass) {
830247a4 1894 cl_init(pRExC_state, &ch_class);
653099ff
GS
1895 data.start_class = &ch_class;
1896 stclass_flag = SCF_DO_STCLASS_AND;
1897 } else /* XXXX Check for BOUND? */
1898 stclass_flag = 0;
cb434fcc 1899 data.last_closep = &last_close;
653099ff 1900
830247a4 1901 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1902 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1903 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1904 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1905 && !RExC_seen_zerolen
1906 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1907 r->reganch |= ROPT_CHECK_ALL;
830247a4 1908 scan_commit(pRExC_state, &data);
c277df42
IZ
1909 SvREFCNT_dec(data.last_found);
1910
a0ed51b3 1911 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1912 if (longest_float_length
c277df42
IZ
1913 || (data.flags & SF_FL_BEFORE_EOL
1914 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1915 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1916 int t;
1917
a0ed51b3 1918 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1919 && data.offset_fixed == data.offset_float_min
1920 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1921 goto remove_float; /* As in (a)+. */
1922
c277df42
IZ
1923 r->float_substr = data.longest_float;
1924 r->float_min_offset = data.offset_float_min;
1925 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1926 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1927 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1928 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1929 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1930 }
1931 else {
aca2d497 1932 remove_float:
c277df42
IZ
1933 r->float_substr = Nullsv;
1934 SvREFCNT_dec(data.longest_float);
c5254dd6 1935 longest_float_length = 0;
a0d0e21e 1936 }
c277df42 1937
a0ed51b3 1938 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1939 if (longest_fixed_length
c277df42
IZ
1940 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1941 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1942 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1943 int t;
1944
c277df42
IZ
1945 r->anchored_substr = data.longest_fixed;
1946 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1947 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1948 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1949 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1950 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1951 }
1952 else {
c277df42
IZ
1953 r->anchored_substr = Nullsv;
1954 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1955 longest_fixed_length = 0;
a0d0e21e 1956 }
b81d288d 1957 if (r->regstclass
ffc61ed2 1958 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1959 r->regstclass = NULL;
1960 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1961 && !(data.start_class->flags & ANYOF_EOS)
1962 && !cl_is_anything(data.start_class)) {
830247a4 1963 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1964
b81d288d 1965 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1966 struct regnode_charclass_class);
1967 StructCopy(data.start_class,
830247a4 1968 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1969 struct regnode_charclass_class);
830247a4 1970 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1971 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1972 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1973 DEBUG_r({ SV *sv = sv_newmortal();
1974 regprop(sv, (regnode*)data.start_class);
1975 PerlIO_printf(Perl_debug_log,
1976 "synthetic stclass `%s'.\n",
1977 SvPVX(sv));});
653099ff 1978 }
c277df42
IZ
1979
1980 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1981 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1982 r->check_substr = r->anchored_substr;
1983 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1984 if (r->reganch & ROPT_ANCH_SINGLE)
1985 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1986 }
1987 else {
c277df42
IZ
1988 r->check_substr = r->float_substr;
1989 r->check_offset_min = data.offset_float_min;
1990 r->check_offset_max = data.offset_float_max;
a0d0e21e 1991 }
30382c73
IZ
1992 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1993 This should be changed ASAP! */
1994 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1995 r->reganch |= RE_USE_INTUIT;
1996 if (SvTAIL(r->check_substr))
1997 r->reganch |= RE_INTUIT_TAIL;
1998 }
a0ed51b3
LW
1999 }
2000 else {
c277df42
IZ
2001 /* Several toplevels. Best we can is to set minlen. */
2002 I32 fake;
653099ff 2003 struct regnode_charclass_class ch_class;
cb434fcc 2004 I32 last_close = 0;
c277df42
IZ
2005
2006 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2007 scan = r->program + 1;
830247a4 2008 cl_init(pRExC_state, &ch_class);
653099ff 2009 data.start_class = &ch_class;
cb434fcc 2010 data.last_closep = &last_close;
e1901655 2011 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 2012 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
2013 if (!(data.start_class->flags & ANYOF_EOS)
2014 && !cl_is_anything(data.start_class)) {
830247a4 2015 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2016
b81d288d 2017 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2018 struct regnode_charclass_class);
2019 StructCopy(data.start_class,
830247a4 2020 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2021 struct regnode_charclass_class);
830247a4 2022 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2023 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2024 DEBUG_r({ SV* sv = sv_newmortal();
2025 regprop(sv, (regnode*)data.start_class);
2026 PerlIO_printf(Perl_debug_log,
2027 "synthetic stclass `%s'.\n",
2028 SvPVX(sv));});
653099ff 2029 }
a0d0e21e
LW
2030 }
2031
a0d0e21e 2032 r->minlen = minlen;
b81d288d 2033 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2034 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2035 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2036 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2037 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2038 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2039 if (RExC_seen & REG_SEEN_CANY)
2040 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2041 Newz(1002, r->startp, RExC_npar, I32);
2042 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2043 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2044 DEBUG_r(regdump(r));
2045 return(r);
a687059c
LW
2046}
2047
2048/*
2049 - reg - regular expression, i.e. main body or parenthesized thing
2050 *
2051 * Caller must absorb opening parenthesis.
2052 *
2053 * Combining parenthesis handling with the base level of regular expression
2054 * is a trifle forced, but the need to tie the tails of the branches to what
2055 * follows makes it hard to avoid.
2056 */
76e3520e 2057STATIC regnode *
830247a4 2058S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2059 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2060{
c277df42
IZ
2061 register regnode *ret; /* Will be the head of the group. */
2062 register regnode *br;
2063 register regnode *lastbr;
2064 register regnode *ender = 0;
a0d0e21e 2065 register I32 parno = 0;
830247a4 2066 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2067
2068 /* for (?g), (?gc), and (?o) warnings; warning
2069 about (?c) will warn about (?g) -- japhy */
2070
2071 I32 wastedflags = 0x00,
2072 wasted_o = 0x01,
2073 wasted_g = 0x02,
2074 wasted_gc = 0x02 | 0x04,
2075 wasted_c = 0x04;
2076
fac92740 2077 char * parse_start = RExC_parse; /* MJD */
830247a4 2078 char *oregcomp_parse = RExC_parse;
c277df42 2079 char c;
a0d0e21e 2080
821b33a5 2081 *flagp = 0; /* Tentatively. */
a0d0e21e 2082
9d1d55b5 2083
a0d0e21e
LW
2084 /* Make an OPEN node, if parenthesized. */
2085 if (paren) {
fac92740 2086 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2087 U16 posflags = 0, negflags = 0;
2088 U16 *flagsp = &posflags;
0f5d15d6 2089 int logical = 0;
830247a4 2090 char *seqstart = RExC_parse;
ca9dfc88 2091
830247a4
IZ
2092 RExC_parse++;
2093 paren = *RExC_parse++;
c277df42 2094 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2095 switch (paren) {
fac92740 2096 case '<': /* (?<...) */
830247a4 2097 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2098 if (*RExC_parse == '!')
c277df42 2099 paren = ',';
b81d288d 2100 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2101 goto unknown;
830247a4 2102 RExC_parse++;
fac92740
MJD
2103 case '=': /* (?=...) */
2104 case '!': /* (?!...) */
830247a4 2105 RExC_seen_zerolen++;
fac92740
MJD
2106 case ':': /* (?:...) */
2107 case '>': /* (?>...) */
a0d0e21e 2108 break;
fac92740
MJD
2109 case '$': /* (?$...) */
2110 case '@': /* (?@...) */
8615cb43 2111 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2112 break;
fac92740 2113 case '#': /* (?#...) */
830247a4
IZ
2114 while (*RExC_parse && *RExC_parse != ')')
2115 RExC_parse++;
2116 if (*RExC_parse != ')')
c277df42 2117 FAIL("Sequence (?#... not terminated");
830247a4 2118 nextchar(pRExC_state);
a0d0e21e
LW
2119 *flagp = TRYAGAIN;
2120 return NULL;
fac92740 2121 case 'p': /* (?p...) */
f9373011
PM
2122 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2123 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2124 /* FALL THROUGH*/
fac92740 2125 case '?': /* (??...) */
0f5d15d6 2126 logical = 1;
830247a4 2127 paren = *RExC_parse++;
0f5d15d6 2128 /* FALL THROUGH */
fac92740 2129 case '{': /* (?{...}) */
c277df42 2130 {
c277df42
IZ
2131 I32 count = 1, n = 0;
2132 char c;
830247a4 2133 char *s = RExC_parse;
c277df42
IZ
2134 SV *sv;
2135 OP_4tree *sop, *rop;
2136
830247a4
IZ
2137 RExC_seen_zerolen++;
2138 RExC_seen |= REG_SEEN_EVAL;
2139 while (count && (c = *RExC_parse)) {
2140 if (c == '\\' && RExC_parse[1])
2141 RExC_parse++;
b81d288d 2142 else if (c == '{')
c277df42 2143 count++;
b81d288d 2144 else if (c == '}')
c277df42 2145 count--;
830247a4 2146 RExC_parse++;
c277df42 2147 }
830247a4 2148 if (*RExC_parse != ')')
b45f050a 2149 {
b81d288d 2150 RExC_parse = s;
b45f050a
JF
2151 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2152 }
c277df42
IZ
2153 if (!SIZE_ONLY) {
2154 AV *av;
b81d288d
AB
2155
2156 if (RExC_parse - 1 - s)
830247a4 2157 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2158 else
79cb57f6 2159 sv = newSVpvn("", 0);
c277df42 2160
569233ed
SB
2161 ENTER;
2162 Perl_save_re_context(aTHX);
c277df42 2163 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2164 sop->op_private |= OPpREFCOUNTED;
2165 /* re_dup will OpREFCNT_inc */
2166 OpREFCNT_set(sop, 1);
569233ed 2167 LEAVE;
c277df42 2168
830247a4
IZ
2169 n = add_data(pRExC_state, 3, "nop");
2170 RExC_rx->data->data[n] = (void*)rop;
2171 RExC_rx->data->data[n+1] = (void*)sop;
2172 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2173 SvREFCNT_dec(sv);
a0ed51b3 2174 }
e24b16f9 2175 else { /* First pass */
830247a4 2176 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2177 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2178 /* No compiled RE interpolated, has runtime
2179 components ===> unsafe. */
2180 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2181 if (PL_tainting && PL_tainted)
cc6b7395 2182 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2183 }
2184
830247a4 2185 nextchar(pRExC_state);
0f5d15d6 2186 if (logical) {
830247a4 2187 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2188 if (!SIZE_ONLY)
2189 ret->flags = 2;
830247a4 2190 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2191 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2192 return ret;
2193 }
830247a4 2194 return reganode(pRExC_state, EVAL, n);
c277df42 2195 }
fac92740 2196 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2197 {
fac92740 2198 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2199 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2200 || RExC_parse[1] == '<'
830247a4 2201 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2202 I32 flag;
2203
830247a4 2204 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2205 if (!SIZE_ONLY)
2206 ret->flags = 1;
830247a4 2207 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2208 goto insert_if;
b81d288d 2209 }
a0ed51b3 2210 }
830247a4 2211 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2212 /* (?(1)...) */
830247a4 2213 parno = atoi(RExC_parse++);
c277df42 2214
830247a4
IZ
2215 while (isDIGIT(*RExC_parse))
2216 RExC_parse++;
fac92740
MJD
2217 ret = reganode(pRExC_state, GROUPP, parno);
2218
830247a4 2219 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2220 vFAIL("Switch condition not recognized");
c277df42 2221 insert_if:
830247a4
IZ
2222 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2223 br = regbranch(pRExC_state, &flags, 1);
c277df42 2224 if (br == NULL)
830247a4 2225 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2226 else
830247a4
IZ
2227 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2228 c = *nextchar(pRExC_state);
d1b80229
IZ
2229 if (flags&HASWIDTH)
2230 *flagp |= HASWIDTH;
c277df42 2231 if (c == '|') {
830247a4
IZ
2232 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2233 regbranch(pRExC_state, &flags, 1);
2234 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2235 if (flags&HASWIDTH)
2236 *flagp |= HASWIDTH;
830247a4 2237 c = *nextchar(pRExC_state);
a0ed51b3
LW
2238 }
2239 else
c277df42
IZ
2240 lastbr = NULL;
2241 if (c != ')')
8615cb43 2242 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2243 ender = reg_node(pRExC_state, TAIL);
2244 regtail(pRExC_state, br, ender);
c277df42 2245 if (lastbr) {
830247a4
IZ
2246 regtail(pRExC_state, lastbr, ender);
2247 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2248 }
2249 else
830247a4 2250 regtail(pRExC_state, ret, ender);
c277df42 2251 return ret;
a0ed51b3
LW
2252 }
2253 else {
830247a4 2254 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2255 }
2256 }
1b1626e4 2257 case 0:
830247a4 2258 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2259 vFAIL("Sequence (? incomplete");
1b1626e4 2260 break;
a0d0e21e 2261 default:
830247a4 2262 --RExC_parse;
fac92740 2263 parse_flags: /* (?i) */
830247a4 2264 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2265 /* (?g), (?gc) and (?o) are useless here
2266 and must be globally applied -- japhy */
2267
2268 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2269 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2270 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2271 if (! (wastedflags & wflagbit) ) {
2272 wastedflags |= wflagbit;
2273 vWARN5(
2274 RExC_parse + 1,
2275 "Useless (%s%c) - %suse /%c modifier",
2276 flagsp == &negflags ? "?-" : "?",
2277 *RExC_parse,
2278 flagsp == &negflags ? "don't " : "",
2279 *RExC_parse
2280 );
2281 }
2282 }
2283 }
2284 else if (*RExC_parse == 'c') {
2285 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2286 if (! (wastedflags & wasted_c) ) {
2287 wastedflags |= wasted_gc;
2288 vWARN3(
2289 RExC_parse + 1,
2290 "Useless (%sc) - %suse /gc modifier",
2291 flagsp == &negflags ? "?-" : "?",
2292 flagsp == &negflags ? "don't " : ""
2293 );
2294 }
2295 }
2296 }
2297 else { pmflag(flagsp, *RExC_parse); }
2298
830247a4 2299 ++RExC_parse;
ca9dfc88 2300 }
830247a4 2301 if (*RExC_parse == '-') {
ca9dfc88 2302 flagsp = &negflags;
9d1d55b5 2303 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2304 ++RExC_parse;
ca9dfc88 2305 goto parse_flags;
48c036b1 2306 }
830247a4
IZ
2307 RExC_flags16 |= posflags;
2308 RExC_flags16 &= ~negflags;
2309 if (*RExC_parse == ':') {
2310 RExC_parse++;
ca9dfc88
IZ
2311 paren = ':';
2312 break;
2313 }
c277df42 2314 unknown:
830247a4
IZ
2315 if (*RExC_parse != ')') {
2316 RExC_parse++;
2317 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2318 }
830247a4 2319 nextchar(pRExC_state);
a0d0e21e
LW
2320 *flagp = TRYAGAIN;
2321 return NULL;
2322 }
2323 }
fac92740 2324 else { /* (...) */
830247a4
IZ
2325 parno = RExC_npar;
2326 RExC_npar++;
2327 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2328 Set_Node_Length(ret, 1); /* MJD */
2329 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2330 open = 1;
a0d0e21e 2331 }
a0ed51b3 2332 }
fac92740 2333 else /* ! paren */
a0d0e21e
LW
2334 ret = NULL;
2335
2336 /* Pick up the branches, linking them together. */
fac92740 2337 parse_start = RExC_parse; /* MJD */
830247a4 2338 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2339 /* branch_len = (paren != 0); */
2340
a0d0e21e
LW
2341 if (br == NULL)
2342 return(NULL);
830247a4
IZ
2343 if (*RExC_parse == '|') {
2344 if (!SIZE_ONLY && RExC_extralen) {
2345 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2346 }
fac92740 2347 else { /* MJD */
830247a4 2348 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2349 Set_Node_Length(br, paren != 0);
2350 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2351 }
c277df42
IZ
2352 have_branch = 1;
2353 if (SIZE_ONLY)
830247a4 2354 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2355 }
2356 else if (paren == ':') {
c277df42
IZ
2357 *flagp |= flags&SIMPLE;
2358 }
2359 if (open) { /* Starts with OPEN. */
830247a4 2360 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2361 }
2362 else if (paren != '?') /* Not Conditional */
a0d0e21e 2363 ret = br;
821b33a5
IZ
2364 if (flags&HASWIDTH)
2365 *flagp |= HASWIDTH;
a0d0e21e 2366 *flagp |= flags&SPSTART;
c277df42 2367 lastbr = br;
830247a4
IZ
2368 while (*RExC_parse == '|') {
2369 if (!SIZE_ONLY && RExC_extralen) {
2370 ender = reganode(pRExC_state, LONGJMP,0);
2371 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2372 }
2373 if (SIZE_ONLY)
830247a4
IZ
2374 RExC_extralen += 2; /* Account for LONGJMP. */
2375 nextchar(pRExC_state);
2376 br = regbranch(pRExC_state, &flags, 0);
fac92740 2377
a687059c 2378 if (br == NULL)
a0d0e21e 2379 return(NULL);
830247a4 2380 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2381 lastbr = br;
821b33a5
IZ
2382 if (flags&HASWIDTH)
2383 *flagp |= HASWIDTH;
a687059c 2384 *flagp |= flags&SPSTART;
a0d0e21e
LW
2385 }
2386
c277df42
IZ
2387 if (have_branch || paren != ':') {
2388 /* Make a closing node, and hook it on the end. */
2389 switch (paren) {
2390 case ':':
830247a4 2391 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2392 break;
2393 case 1:
830247a4 2394 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2395 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2396 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2397 break;
2398 case '<':
c277df42
IZ
2399 case ',':
2400 case '=':
2401 case '!':
c277df42 2402 *flagp &= ~HASWIDTH;
821b33a5
IZ
2403 /* FALL THROUGH */
2404 case '>':
830247a4 2405 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2406 break;
2407 case 0:
830247a4 2408 ender = reg_node(pRExC_state, END);
c277df42
IZ
2409 break;
2410 }
830247a4 2411 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2412
c277df42
IZ
2413 if (have_branch) {
2414 /* Hook the tails of the branches to the closing node. */
2415 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2416 regoptail(pRExC_state, br, ender);
c277df42
IZ
2417 }
2418 }
a0d0e21e 2419 }
c277df42
IZ
2420
2421 {
2422 char *p;
2423 static char parens[] = "=!<,>";
2424
2425 if (paren && (p = strchr(parens, paren))) {
2426 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2427 int flag = (p - parens) > 1;
2428
2429 if (paren == '>')
2430 node = SUSPEND, flag = 0;
830247a4 2431 reginsert(pRExC_state, node,ret);
c277df42 2432 ret->flags = flag;
830247a4 2433 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2434 }
a0d0e21e
LW
2435 }
2436
2437 /* Check for proper termination. */
ce3e6498 2438 if (paren) {
830247a4
IZ
2439 RExC_flags16 = oregflags;
2440 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2441 RExC_parse = oregcomp_parse;
380a0633 2442 vFAIL("Unmatched (");
ce3e6498 2443 }
a0ed51b3 2444 }
830247a4
IZ
2445 else if (!paren && RExC_parse < RExC_end) {
2446 if (*RExC_parse == ')') {
2447 RExC_parse++;
380a0633 2448 vFAIL("Unmatched )");
a0ed51b3
LW
2449 }
2450 else
b45f050a 2451 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2452 /* NOTREACHED */
2453 }
a687059c 2454
a0d0e21e 2455 return(ret);
a687059c
LW
2456}
2457
2458/*
2459 - regbranch - one alternative of an | operator
2460 *
2461 * Implements the concatenation operator.
2462 */
76e3520e 2463STATIC regnode *
830247a4 2464S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2465{
c277df42
IZ
2466 register regnode *ret;
2467 register regnode *chain = NULL;
2468 register regnode *latest;
2469 I32 flags = 0, c = 0;
a0d0e21e 2470
b81d288d 2471 if (first)
c277df42
IZ
2472 ret = NULL;
2473 else {
b81d288d 2474 if (!SIZE_ONLY && RExC_extralen)
830247a4 2475 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2476 else {
830247a4 2477 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2478 Set_Node_Length(ret, 1);
2479 }
c277df42
IZ
2480 }
2481
b81d288d 2482 if (!first && SIZE_ONLY)
830247a4 2483 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2484
c277df42 2485 *flagp = WORST; /* Tentatively. */
a0d0e21e 2486
830247a4
IZ
2487 RExC_parse--;
2488 nextchar(pRExC_state);
2489 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2490 flags &= ~TRYAGAIN;
830247a4 2491 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2492 if (latest == NULL) {
2493 if (flags & TRYAGAIN)
2494 continue;
2495 return(NULL);
a0ed51b3
LW
2496 }
2497 else if (ret == NULL)
c277df42 2498 ret = latest;
a0d0e21e 2499 *flagp |= flags&HASWIDTH;
c277df42 2500 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2501 *flagp |= flags&SPSTART;
2502 else {
830247a4
IZ
2503 RExC_naughty++;
2504 regtail(pRExC_state, chain, latest);
a687059c 2505 }
a0d0e21e 2506 chain = latest;
c277df42
IZ
2507 c++;
2508 }
2509 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2510 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2511 if (ret == NULL)
2512 ret = chain;
2513 }
2514 if (c == 1) {
2515 *flagp |= flags&SIMPLE;
a0d0e21e 2516 }
a687059c 2517
a0d0e21e 2518 return(ret);
a687059c
LW
2519}
2520
2521/*
2522 - regpiece - something followed by possible [*+?]
2523 *
2524 * Note that the branching code sequences used for ? and the general cases
2525 * of * and + are somewhat optimized: they use the same NOTHING node as
2526 * both the endmarker for their branch list and the body of the last branch.
2527 * It might seem that this node could be dispensed with entirely, but the
2528 * endmarker role is not redundant.
2529 */
76e3520e 2530STATIC regnode *
830247a4 2531S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2532{
c277df42 2533 register regnode *ret;
a0d0e21e
LW
2534 register char op;
2535 register char *next;
2536 I32 flags;
830247a4 2537 char *origparse = RExC_parse;
a0d0e21e
LW
2538 char *maxpos;
2539 I32 min;
c277df42 2540 I32 max = REG_INFTY;
fac92740 2541 char *parse_start;
a0d0e21e 2542
830247a4 2543 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2544 if (ret == NULL) {
2545 if (flags & TRYAGAIN)
2546 *flagp |= TRYAGAIN;
2547 return(NULL);
2548 }
2549
830247a4 2550 op = *RExC_parse;
a0d0e21e 2551
830247a4 2552 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2553 parse_start = RExC_parse; /* MJD */
830247a4 2554 next = RExC_parse + 1;
a0d0e21e
LW
2555 maxpos = Nullch;
2556 while (isDIGIT(*next) || *next == ',') {
2557 if (*next == ',') {
2558 if (maxpos)
2559 break;
2560 else
2561 maxpos = next;
a687059c 2562 }
a0d0e21e
LW
2563 next++;
2564 }
2565 if (*next == '}') { /* got one */
2566 if (!maxpos)
2567 maxpos = next;
830247a4
IZ
2568 RExC_parse++;
2569 min = atoi(RExC_parse);
a0d0e21e
LW
2570 if (*maxpos == ',')
2571 maxpos++;
2572 else
830247a4 2573 maxpos = RExC_parse;
a0d0e21e
LW
2574 max = atoi(maxpos);
2575 if (!max && *maxpos != '0')
c277df42
IZ
2576 max = REG_INFTY; /* meaning "infinity" */
2577 else if (max >= REG_INFTY)
8615cb43 2578 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2579 RExC_parse = next;
2580 nextchar(pRExC_state);
a0d0e21e
LW
2581
2582 do_curly:
2583 if ((flags&SIMPLE)) {
830247a4
IZ
2584 RExC_naughty += 2 + RExC_naughty / 2;
2585 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2586 Set_Node_Offset(ret, parse_start+1); /* MJD */
2587 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2588 }
2589 else {
830247a4 2590 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2591
2592 w->flags = 0;
830247a4
IZ
2593 regtail(pRExC_state, ret, w);
2594 if (!SIZE_ONLY && RExC_extralen) {
2595 reginsert(pRExC_state, LONGJMP,ret);
2596 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2597 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2598 }
830247a4 2599 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2600 /* MJD hk */
2601 Set_Node_Offset(ret, parse_start+1);
2602 Set_Node_Length(ret,
2603 op == '{' ? (RExC_parse - parse_start) : 1);
2604
830247a4 2605 if (!SIZE_ONLY && RExC_extralen)
c277df42 2606 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2607 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2608 if (SIZE_ONLY)
830247a4
IZ
2609 RExC_whilem_seen++, RExC_extralen += 3;
2610 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2611 }
c277df42 2612 ret->flags = 0;
a0d0e21e
LW
2613
2614 if (min > 0)
821b33a5
IZ
2615 *flagp = WORST;
2616 if (max > 0)
2617 *flagp |= HASWIDTH;
a0d0e21e 2618 if (max && max < min)
8615cb43 2619 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2620 if (!SIZE_ONLY) {
2621 ARG1_SET(ret, min);
2622 ARG2_SET(ret, max);
a687059c 2623 }
a687059c 2624
a0d0e21e 2625 goto nest_check;
a687059c 2626 }
a0d0e21e 2627 }
a687059c 2628
a0d0e21e
LW
2629 if (!ISMULT1(op)) {
2630 *flagp = flags;
a687059c 2631 return(ret);
a0d0e21e 2632 }
bb20fd44 2633
c277df42 2634#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2635
2636 /* if this is reinstated, don't forget to put this back into perldiag:
2637
2638 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2639
2640 (F) The part of the regexp subject to either the * or + quantifier
2641 could match an empty string. The {#} shows in the regular
2642 expression about where the problem was discovered.
2643
2644 */
2645
bb20fd44 2646 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2647 vFAIL("Regexp *+ operand could be empty");
b81d288d 2648#endif
bb20fd44 2649
fac92740 2650 parse_start = RExC_parse;
830247a4 2651 nextchar(pRExC_state);
a0d0e21e 2652
821b33a5 2653 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2654
2655 if (op == '*' && (flags&SIMPLE)) {
830247a4 2656 reginsert(pRExC_state, STAR, ret);
c277df42 2657 ret->flags = 0;
830247a4 2658 RExC_naughty += 4;
a0d0e21e
LW
2659 }
2660 else if (op == '*') {
2661 min = 0;
2662 goto do_curly;
a0ed51b3
LW
2663 }
2664 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2665 reginsert(pRExC_state, PLUS, ret);
c277df42 2666 ret->flags = 0;
830247a4 2667 RExC_naughty += 3;
a0d0e21e
LW
2668 }
2669 else if (op == '+') {
2670 min = 1;
2671 goto do_curly;
a0ed51b3
LW
2672 }
2673 else if (op == '?') {
a0d0e21e
LW
2674 min = 0; max = 1;
2675 goto do_curly;
2676 }
2677 nest_check:
e476b1b5 2678 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2679 vWARN3(RExC_parse,
b45f050a 2680 "%.*s matches null string many times",
830247a4 2681 RExC_parse - origparse,
b45f050a 2682 origparse);
a0d0e21e
LW
2683 }
2684
830247a4
IZ
2685 if (*RExC_parse == '?') {
2686 nextchar(pRExC_state);
2687 reginsert(pRExC_state, MINMOD, ret);
2688 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2689 }
830247a4
IZ
2690 if (ISMULT2(RExC_parse)) {
2691 RExC_parse++;
b45f050a
JF
2692 vFAIL("Nested quantifiers");
2693 }
a0d0e21e
LW
2694
2695 return(ret);
a687059c
LW
2696}
2697
2698/*
2699 - regatom - the lowest level
2700 *
2701 * Optimization: gobbles an entire sequence of ordinary characters so that
2702 * it can turn them into a single node, which is smaller to store and
2703 * faster to run. Backslashed characters are exceptions, each becoming a
2704 * separate node; the code is simpler that way and it's not worth fixing.
2705 *
b45f050a 2706 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2707STATIC regnode *
830247a4 2708S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2709{
c277df42 2710 register regnode *ret = 0;
a0d0e21e 2711 I32 flags;
f06dbbb7 2712 char *parse_start = 0;
a0d0e21e
LW
2713
2714 *flagp = WORST; /* Tentatively. */
2715
2716tryagain:
830247a4 2717 switch (*RExC_parse) {
a0d0e21e 2718 case '^':
830247a4
IZ
2719 RExC_seen_zerolen++;
2720 nextchar(pRExC_state);
2721 if (RExC_flags16 & PMf_MULTILINE)
2722 ret = reg_node(pRExC_state, MBOL);
2723 else if (RExC_flags16 & PMf_SINGLELINE)
2724 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2725 else
830247a4 2726 ret = reg_node(pRExC_state, BOL);
fac92740 2727 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2728 break;
2729 case '$':
830247a4 2730 nextchar(pRExC_state);
b81d288d 2731 if (*RExC_parse)
830247a4
IZ
2732 RExC_seen_zerolen++;
2733 if (RExC_flags16 & PMf_MULTILINE)
2734 ret = reg_node(pRExC_state, MEOL);
2735 else if (RExC_flags16 & PMf_SINGLELINE)
2736 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2737 else
830247a4 2738 ret = reg_node(pRExC_state, EOL);
fac92740 2739 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2740 break;
2741 case '.':
830247a4 2742 nextchar(pRExC_state);
cce850e4 2743 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2744 ret = reg_node(pRExC_state, SANY);
2745 else
2746 ret = reg_node(pRExC_state, REG_ANY);
2747 *flagp |= HASWIDTH|SIMPLE;
830247a4 2748 RExC_naughty++;
fac92740 2749 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2750 break;
2751 case '[':
b45f050a 2752 {
830247a4 2753 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2754 ret = regclass(pRExC_state);
830247a4
IZ
2755 if (*RExC_parse != ']') {
2756 RExC_parse = oregcomp_parse;
b45f050a
JF
2757 vFAIL("Unmatched [");
2758 }
830247a4 2759 nextchar(pRExC_state);
a0d0e21e 2760 *flagp |= HASWIDTH|SIMPLE;
fac92740 2761 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2762 break;
b45f050a 2763 }
a0d0e21e 2764 case '(':
830247a4
IZ
2765 nextchar(pRExC_state);
2766 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2767 if (ret == NULL) {
bf93d4cc 2768 if (flags & TRYAGAIN) {
830247a4 2769 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2770 /* Make parent create an empty node if needed. */
2771 *flagp |= TRYAGAIN;
2772 return(NULL);
2773 }
a0d0e21e 2774 goto tryagain;
bf93d4cc 2775 }
a0d0e21e
LW
2776 return(NULL);
2777 }
c277df42 2778 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2779 break;
2780 case '|':
2781 case ')':
2782 if (flags & TRYAGAIN) {
2783 *flagp |= TRYAGAIN;
2784 return NULL;
2785 }
b45f050a 2786 vFAIL("Internal urp");
a0d0e21e
LW
2787 /* Supposed to be caught earlier. */
2788 break;
85afd4ae 2789 case '{':
830247a4
IZ
2790 if (!regcurly(RExC_parse)) {
2791 RExC_parse++;
85afd4ae
CS
2792 goto defchar;
2793 }
2794 /* FALL THROUGH */
a0d0e21e
LW
2795 case '?':
2796 case '+':
2797 case '*':
830247a4 2798 RExC_parse++;
b45f050a 2799 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2800 break;
2801 case '\\':
830247a4 2802 switch (*++RExC_parse) {
a0d0e21e 2803 case 'A':
830247a4
IZ
2804 RExC_seen_zerolen++;
2805 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2806 *flagp |= SIMPLE;
830247a4 2807 nextchar(pRExC_state);
fac92740 2808 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2809 break;
2810 case 'G':
830247a4
IZ
2811 ret = reg_node(pRExC_state, GPOS);
2812 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2813 *flagp |= SIMPLE;
830247a4 2814 nextchar(pRExC_state);
fac92740 2815 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2816 break;
2817 case 'Z':
830247a4 2818 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2819 *flagp |= SIMPLE;
a1917ab9 2820 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2821 nextchar(pRExC_state);
a0d0e21e 2822 break;
b85d18e9 2823 case 'z':
830247a4 2824 ret = reg_node(pRExC_state, EOS);
b85d18e9 2825 *flagp |= SIMPLE;
830247a4
IZ
2826 RExC_seen_zerolen++; /* Do not optimize RE away */
2827 nextchar(pRExC_state);
fac92740 2828 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2829 break;
4a2d328f 2830 case 'C':
f33976b4
DB
2831 ret = reg_node(pRExC_state, CANY);
2832 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2833 *flagp |= HASWIDTH|SIMPLE;
830247a4 2834 nextchar(pRExC_state);
fac92740 2835 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2836 break;
2837 case 'X':
830247a4 2838 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2839 *flagp |= HASWIDTH;
830247a4 2840 nextchar(pRExC_state);
fac92740 2841 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2842 break;
a0d0e21e 2843 case 'w':
ffc61ed2 2844 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2845 *flagp |= HASWIDTH|SIMPLE;
830247a4 2846 nextchar(pRExC_state);
fac92740 2847 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2848 break;
2849 case 'W':
ffc61ed2 2850 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2851 *flagp |= HASWIDTH|SIMPLE;
830247a4 2852 nextchar(pRExC_state);
fac92740 2853 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2854 break;
2855 case 'b':
830247a4
IZ
2856 RExC_seen_zerolen++;
2857 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2858 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2859 *flagp |= SIMPLE;
830247a4 2860 nextchar(pRExC_state);
fac92740 2861 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2862 break;
2863 case 'B':
830247a4
IZ
2864 RExC_seen_zerolen++;
2865 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2866 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2867 *flagp |= SIMPLE;
830247a4 2868 nextchar(pRExC_state);
fac92740 2869 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2870 break;
2871 case 's':
ffc61ed2 2872 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2873 *flagp |= HASWIDTH|SIMPLE;
830247a4 2874 nextchar(pRExC_state);
fac92740 2875 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2876 break;
2877 case 'S':
ffc61ed2 2878 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2879 *flagp |= HASWIDTH|SIMPLE;
830247a4 2880 nextchar(pRExC_state);
fac92740 2881 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2882 break;
2883 case 'd':
ffc61ed2 2884 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2885 *flagp |= HASWIDTH|SIMPLE;
830247a4 2886 nextchar(pRExC_state);
fac92740 2887 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2888 break;
2889 case 'D':
ffc61ed2 2890 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2891 *flagp |= HASWIDTH|SIMPLE;
830247a4 2892 nextchar(pRExC_state);
fac92740 2893 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2894 break;
a14b48bc
LW
2895 case 'p':
2896 case 'P':
3568d838 2897 {
830247a4 2898 char* oldregxend = RExC_end;
fac92740 2899 char* parse_start = RExC_parse;
a14b48bc 2900
830247a4 2901 if (RExC_parse[1] == '{') {
3568d838 2902 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2903 RExC_end = strchr(RExC_parse, '}');
2904 if (!RExC_end) {
0da60cf5 2905 U8 c = (U8)*RExC_parse;
830247a4
IZ
2906 RExC_parse += 2;
2907 RExC_end = oldregxend;
0da60cf5 2908 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2909 }
830247a4 2910 RExC_end++;
a14b48bc
LW
2911 }
2912 else
830247a4
IZ
2913 RExC_end = RExC_parse + 2;
2914 RExC_parse--;
a14b48bc 2915
ffc61ed2 2916 ret = regclass(pRExC_state);
a14b48bc 2917
830247a4
IZ
2918 RExC_end = oldregxend;
2919 RExC_parse--;
fac92740 2920 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2921 nextchar(pRExC_state);
a14b48bc
LW
2922 *flagp |= HASWIDTH|SIMPLE;
2923 }
2924 break;
a0d0e21e
LW
2925 case 'n':
2926 case 'r':
2927 case 't':
2928 case 'f':
2929 case 'e':
2930 case 'a':
2931 case 'x':
2932 case 'c':
2933 case '0':
2934 goto defchar;
2935 case '1': case '2': case '3': case '4':
2936 case '5': case '6': case '7': case '8': case '9':
2937 {
830247a4 2938 I32 num = atoi(RExC_parse);
a0d0e21e 2939
830247a4 2940 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2941 goto defchar;
2942 else {
fac92740 2943 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2944 while (isDIGIT(*RExC_parse))
2945 RExC_parse++;
b45f050a 2946
830247a4 2947 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2948 vFAIL("Reference to nonexistent group");
830247a4
IZ
2949 RExC_sawback = 1;
2950 ret = reganode(pRExC_state, FOLD
a0ed51b3 2951 ? (LOC ? REFFL : REFF)
c8756f30 2952 : REF, num);
a0d0e21e 2953 *flagp |= HASWIDTH;
fac92740
MJD
2954
2955 /* override incorrect value set in reganode MJD */
2956 Set_Node_Offset(ret, parse_start+1);
2957 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2958 RExC_parse--;
2959 nextchar(pRExC_state);
a0d0e21e
LW
2960 }
2961 }
2962 break;
2963 case '\0':
830247a4 2964 if (RExC_parse >= RExC_end)
b45f050a 2965 FAIL("Trailing \\");
a0d0e21e
LW
2966 /* FALL THROUGH */
2967 default:
c9f97d15
IZ
2968 /* Do not generate `unrecognized' warnings here, we fall
2969 back into the quick-grab loop below */
a0d0e21e
LW
2970 goto defchar;
2971 }
2972 break;
4633a7c4
LW
2973
2974 case '#':
830247a4
IZ
2975 if (RExC_flags16 & PMf_EXTENDED) {
2976 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2977 if (RExC_parse < RExC_end)
4633a7c4
LW
2978 goto tryagain;
2979 }
2980 /* FALL THROUGH */
2981
a0d0e21e 2982 default: {
ba210ebe 2983 register STRLEN len;
a0ed51b3 2984 register UV ender;
a0d0e21e 2985 register char *p;
c277df42 2986 char *oldp, *s;
ba210ebe 2987 STRLEN numlen;
a2a2844f 2988 STRLEN ulen;
e7ae6809 2989 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
f06dbbb7
JH
2990
2991 parse_start = RExC_parse - 1;
a0d0e21e 2992
830247a4 2993 RExC_parse++;
a0d0e21e
LW
2994
2995 defchar:
830247a4 2996 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2997 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2998 : EXACT);
cd439c50 2999 s = STRING(ret);
830247a4
IZ
3000 for (len = 0, p = RExC_parse - 1;
3001 len < 127 && p < RExC_end;
a0d0e21e
LW
3002 len++)
3003 {
3004 oldp = p;
5b5a24f7 3005
830247a4
IZ
3006 if (RExC_flags16 & PMf_EXTENDED)
3007 p = regwhite(p, RExC_end);
a0d0e21e
LW
3008 switch (*p) {
3009 case '^':
3010 case '$':
3011 case '.':
3012 case '[':
3013 case '(':
3014 case ')':
3015 case '|':
3016 goto loopdone;
3017 case '\\':
3018 switch (*++p) {
3019 case 'A':
1ed8eac0
JF
3020 case 'C':
3021 case 'X':
a0d0e21e
LW
3022 case 'G':
3023 case 'Z':
b85d18e9 3024 case 'z':
a0d0e21e
LW
3025 case 'w':
3026 case 'W':
3027 case 'b':
3028 case 'B':
3029 case 's':
3030 case 'S':
3031 case 'd':
3032 case 'D':
a14b48bc
LW
3033 case 'p':
3034 case 'P':
a0d0e21e
LW
3035 --p;
3036 goto loopdone;
3037 case 'n':
3038 ender = '\n';
3039 p++;
a687059c 3040 break;
a0d0e21e
LW
3041 case 'r':
3042 ender = '\r';
3043 p++;
a687059c 3044 break;
a0d0e21e
LW
3045 case 't':
3046 ender = '\t';
3047 p++;
a687059c 3048 break;
a0d0e21e
LW
3049 case 'f':
3050 ender = '\f';
3051 p++;
a687059c 3052 break;
a0d0e21e 3053 case 'e':
c7f1f016 3054 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3055 p++;
a687059c 3056 break;
a0d0e21e 3057 case 'a':
c7f1f016 3058 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3059 p++;
a687059c 3060 break;
a0d0e21e 3061 case 'x':
a0ed51b3
LW
3062 if (*++p == '{') {
3063 char* e = strchr(p, '}');
b81d288d 3064
b45f050a 3065 if (!e) {
830247a4 3066 RExC_parse = p + 1;
b45f050a
JF
3067 vFAIL("Missing right brace on \\x{}");
3068 }
de5f0749 3069 else {
a4c04bdc
NC
3070 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3071 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3072 numlen = e - p - 1;
3073 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3074 if (ender > 0xff)
3075 RExC_utf8 = 1;
b21ed0a9
GS
3076 /* numlen is generous */
3077 if (numlen + len >= 127) {
a0ed51b3
LW
3078 p--;
3079 goto loopdone;
3080 }
3081 p = e + 1;
3082 }
a0ed51b3
LW
3083 }
3084 else {
a4c04bdc 3085 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3086 numlen = 2;
3087 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3088 p += numlen;
3089 }
a687059c 3090 break;
a0d0e21e
LW
3091 case 'c':
3092 p++;
bbce6d69 3093 ender = UCHARAT(p++);
3094 ender = toCTRL(ender);
a687059c 3095 break;
a0d0e21e
LW
3096 case '0': case '1': case '2': case '3':case '4':
3097 case '5': case '6': case '7': case '8':case '9':
3098 if (*p == '0' ||
830247a4 3099 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3100 I32 flags = 0;
3101 numlen = 3;
3102 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3103 p += numlen;
3104 }
3105 else {
3106 --p;
3107 goto loopdone;
a687059c
LW
3108 }
3109 break;
a0d0e21e 3110 case '\0':
830247a4 3111 if (p >= RExC_end)
b45f050a 3112 FAIL("Trailing \\");
a687059c 3113 /* FALL THROUGH */
a0d0e21e 3114 default:
e476b1b5 3115 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3116 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3117 goto normal_default;
a0d0e21e
LW
3118 }
3119 break;
a687059c 3120 default:
a0ed51b3 3121 normal_default:
fd400ab9 3122 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3123 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3124 &numlen, 0);
a0ed51b3
LW
3125 p += numlen;
3126 }
3127 else
3128 ender = *p++;
a0d0e21e 3129 break;
a687059c 3130 }
830247a4
IZ
3131 if (RExC_flags16 & PMf_EXTENDED)
3132 p = regwhite(p, RExC_end);
a0ed51b3 3133 if (UTF && FOLD) {
22c54be3 3134 toFOLD_uni(ender, tmpbuf, &ulen);
a2a2844f 3135 ender = utf8_to_uvchr(tmpbuf, 0);
a0ed51b3 3136 }
a0d0e21e
LW
3137 if (ISMULT2(p)) { /* Back off on ?+*. */
3138 if (len)
3139 p = oldp;
2b9d42f0 3140 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3141 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3142 s += numlen;
3143 len += numlen;
3144 }
a0d0e21e
LW
3145 else {
3146 len++;
cd439c50 3147 REGC(ender, s++);
a0d0e21e
LW
3148 }
3149 break;
a687059c 3150 }
2b9d42f0 3151 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3152 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3153 s += numlen;
3154 len += numlen - 1;
3155 }
3156 else
cd439c50 3157 REGC(ender, s++);
a0d0e21e
LW
3158 }
3159 loopdone:
830247a4 3160 RExC_parse = p - 1;
fac92740 3161 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3162 nextchar(pRExC_state);
793db0cb
JH
3163 {
3164 /* len is STRLEN which is unsigned, need to copy to signed */
3165 IV iv = len;
3166 if (iv < 0)
3167 vFAIL("Internal disaster");
3168 }
a0d0e21e
LW
3169 if (len > 0)
3170 *flagp |= HASWIDTH;
3171 if (len == 1)
3172 *flagp |= SIMPLE;
c277df42 3173 if (!SIZE_ONLY)
cd439c50
IZ
3174 STR_LEN(ret) = len;
3175 if (SIZE_ONLY)
830247a4 3176 RExC_size += STR_SZ(len);
cd439c50 3177 else
830247a4 3178 RExC_emit += STR_SZ(len);
a687059c 3179 }
a0d0e21e
LW
3180 break;
3181 }
a687059c 3182
22c54be3 3183 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
a72c7584
JH
3184 STRLEN oldlen = STR_LEN(ret);
3185 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
22c54be3
JH
3186
3187 if (RExC_utf8)
3188 SvUTF8_on(sv);
3189 if (sv_utf8_downgrade(sv, TRUE)) {
3190 char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3191 STRLEN newlen = SvCUR(sv);
3192
3193 if (!SIZE_ONLY) {
3194 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3195 (int)oldlen, STRING(ret),
3196 (int)newlen, s));
3197 Copy(s, STRING(ret), newlen, char);
3198 STR_LEN(ret) += newlen - oldlen;
3199 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3200 } else
3201 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3202 }
a72c7584
JH
3203 }
3204
a0d0e21e 3205 return(ret);
a687059c
LW
3206}
3207
873ef191 3208STATIC char *
cea2e8a9 3209S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3210{
3211 while (p < e) {
3212 if (isSPACE(*p))
3213 ++p;
3214 else if (*p == '#') {
3215 do {
3216 p++;
3217 } while (p < e && *p != '\n');
3218 }
3219 else
3220 break;
3221 }
3222 return p;
3223}
3224
b8c5462f
JH
3225/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3226 Character classes ([:foo:]) can also be negated ([:^foo:]).
3227 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3228 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3229 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3230
3231#define POSIXCC_DONE(c) ((c) == ':')
3232#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3233#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3234
b8c5462f 3235STATIC I32
830247a4 3236S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3237{
3238 char *posixcc = 0;
936ed897 3239 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3240
830247a4 3241 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3242 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3243 POSIXCC(UCHARAT(RExC_parse))) {
3244 char c = UCHARAT(RExC_parse);
830247a4 3245 char* s = RExC_parse++;
b81d288d 3246
9a86a77b 3247 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3248 RExC_parse++;
3249 if (RExC_parse == RExC_end)
620e46c5 3250 /* Grandfather lone [:, [=, [. */
830247a4 3251 RExC_parse = s;
620e46c5 3252 else {
830247a4 3253 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3254
9a86a77b 3255 if (UCHARAT(RExC_parse) == ']') {
830247a4 3256 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3257 posixcc = s + 1;
3258 if (*s == ':') {
3259 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3260 I32 skip = 5; /* the most common skip */
3261
3262 switch (*posixcc) {
3263 case 'a':
3264 if (strnEQ(posixcc, "alnum", 5))
3265 namedclass =
3266 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3267 else if (strnEQ(posixcc, "alpha", 5))
3268 namedclass =
3269 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3270 else if (strnEQ(posixcc, "ascii", 5))
3271 namedclass =
3272 complement ? ANYOF_NASCII : ANYOF_ASCII;
3273 break;
aaa51d5e
JF
3274 case 'b':
3275 if (strnEQ(posixcc, "blank", 5))
3276 namedclass =
3277 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3278 break;
b8c5462f
JH
3279 case 'c':
3280 if (strnEQ(posixcc, "cntrl", 5))
3281 namedclass =
3282 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3283 break;
3284 case 'd':
3285 if (strnEQ(posixcc, "digit", 5))
3286 namedclass =
3287 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3288 break;
3289 case 'g':
3290 if (strnEQ(posixcc, "graph", 5))
3291 namedclass =
3292 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3293 break;
3294 case 'l':
3295 if (strnEQ(posixcc, "lower", 5))
3296 namedclass =
3297 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3298 break;
3299 case 'p':
3300 if (strnEQ(posixcc, "print", 5))
3301 namedclass =
3302 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3303 else if (strnEQ(posixcc, "punct", 5))
3304 namedclass =
3305 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3306 break;
3307 case 's':
3308 if (strnEQ(posixcc, "space", 5))
3309 namedclass =
aaa51d5e 3310 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3311 break;
b8c5462f
JH
3312 case 'u':
3313 if (strnEQ(posixcc, "upper", 5))
3314 namedclass =
3315 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3316 break;
3317 case 'w': /* this is not POSIX, this is the Perl \w */
3318 if (strnEQ(posixcc, "word", 4)) {
3319 namedclass =
3320 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3321 skip = 4;
3322 }
3323 break;
3324 case 'x':
3325 if (strnEQ(posixcc, "xdigit", 6)) {
3326 namedclass =
3327 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3328 skip = 6;
3329 }
3330 break;
3331 }
ac561586
JH
3332 if (namedclass == OOB_NAMEDCLASS ||
3333 posixcc[skip] != ':' ||
3334 posixcc[skip+1] != ']')
b45f050a
JF
3335 {
3336 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3337 t - s - 1, s + 1);
3338 }
3339 } else if (!SIZE_ONLY) {
b8c5462f 3340 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3341
830247a4 3342 /* adjust RExC_parse so the warning shows after
b45f050a 3343 the class closes */
9a86a77b 3344 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3345 RExC_parse++;
b45f050a
JF
3346 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3347 }
b8c5462f
JH
3348 } else {
3349 /* Maternal grandfather:
3350 * "[:" ending in ":" but not in ":]" */
830247a4 3351 RExC_parse = s;
767d463e 3352 }
620e46c5
JH
3353 }
3354 }
3355
b8c5462f
JH
3356 return namedclass;
3357}
3358
3359STATIC void
830247a4 3360S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3361{
e476b1b5 3362 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
9a86a77b 3363 POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3364 char *s = RExC_parse;
93733859 3365 char c = *s++;
b8c5462f
JH
3366
3367 while(*s && isALNUM(*s))
3368 s++;
3369 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3370 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3371
3372 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3373 if (POSIXCC_NOTYET(c)) {
830247a4 3374 /* adjust RExC_parse so the error shows after
b45f050a 3375 the class closes */
9a86a77b 3376 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3377 ;
3378 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3379 }
b8c5462f
JH
3380 }
3381 }
620e46c5
JH
3382}
3383
76e3520e 3384STATIC regnode *
830247a4 3385S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3386{
ffc61ed2 3387 register UV value;
9a86a77b 3388 register UV nextvalue;
3568d838 3389 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3390 register IV range = 0;
c277df42 3391 register regnode *ret;
ba210ebe 3392 STRLEN numlen;
ffc61ed2 3393 IV namedclass;
9c5ffd7c 3394 char *rangebegin = 0;
936ed897 3395 bool need_class = 0;
9c5ffd7c 3396 SV *listsv = Nullsv;
ffc61ed2
JH
3397 register char *e;
3398 UV n;
3568d838 3399 bool optimize_invert = TRUE;
ffc61ed2
JH
3400
3401 ret = reganode(pRExC_state, ANYOF, 0);
3402
3403 if (!SIZE_ONLY)
3404 ANYOF_FLAGS(ret) = 0;
3405
9a86a77b 3406 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3407 RExC_naughty++;
3408 RExC_parse++;
3409 if (!SIZE_ONLY)
3410 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3411 }
a0d0e21e 3412
936ed897 3413 if (SIZE_ONLY)
830247a4 3414 RExC_size += ANYOF_SKIP;
936ed897 3415 else {
830247a4 3416 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3417 if (FOLD)
3418 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3419 if (LOC)
3420 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3421 ANYOF_BITMAP_ZERO(ret);
3422 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3423 }
b8c5462f 3424
9a86a77b
JH
3425 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3426
3427 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
830247a4 3428 checkposixcc(pRExC_state);
b8c5462f 3429
9a86a77b 3430 if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
ffc61ed2
JH
3431 goto charclassloop; /* allow 1st char to be ] or - */
3432
9a86a77b 3433 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3434
3435 charclassloop:
3436
3437 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3438
73b437c8 3439 if (!range)
830247a4 3440 rangebegin = RExC_parse;
ffc61ed2 3441 if (UTF) {
5e12f4fb 3442 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3443 RExC_end - RExC_parse,
3444 &numlen, 0);
ffc61ed2
JH
3445 RExC_parse += numlen;
3446 }
3447 else
3448 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3449 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3450 if (value == '[' && POSIXCC(nextvalue))
830247a4 3451 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3452 else if (value == '\\') {
ffc61ed2 3453 if (UTF) {
5e12f4fb 3454 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3455 RExC_end - RExC_parse,
3456 &numlen, 0);