This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Retract #10324 and #10333; not needed.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
bc89e66f 72 **** Copyright (c) 1991-2001, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
c5be433b
GS
86#ifdef PERL_IN_XSUB_RE
87# if defined(PERL_CAPI) || defined(PERL_OBJECT)
88# include "XSUB.h"
89# endif
90#else
d06ea78c
GS
91# include "INTERN.h"
92#endif
c277df42
IZ
93
94#define REG_COMP_C
a687059c
LW
95#include "regcomp.h"
96
d4cce5f1 97#ifdef op
11343788 98#undef op
d4cce5f1 99#endif /* op */
11343788 100
fe14fcc3
LW
101#ifdef MSDOS
102# if defined(BUGGY_MSC6)
103 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104 # pragma optimize("a",off)
105 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106 # pragma optimize("w",on )
107# endif /* BUGGY_MSC6 */
108#endif /* MSDOS */
109
a687059c
LW
110#ifndef STATIC
111#define STATIC static
112#endif
113
830247a4
IZ
114typedef struct RExC_state_t {
115 U16 flags16; /* are we folding, multilining? */
116 char *precomp; /* uncompiled string. */
117 regexp *rx;
fac92740 118 char *start; /* Start of input for compile */
830247a4
IZ
119 char *end; /* End of input for compile */
120 char *parse; /* Input-scan pointer. */
121 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 122 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 123 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
124 I32 naughty; /* How bad is this pattern? */
125 I32 sawback; /* Did we see \1, ...? */
126 U32 seen;
127 I32 size; /* Code size. */
128 I32 npar; /* () count. */
129 I32 extralen;
130 I32 seen_zerolen;
131 I32 seen_evals;
1aa99e6b 132 I32 utf8;
830247a4
IZ
133#if ADD_TO_REGEXEC
134 char *starttry; /* -Dr: where regtry was called. */
135#define RExC_starttry (pRExC_state->starttry)
136#endif
137} RExC_state_t;
138
139#define RExC_flags16 (pRExC_state->flags16)
140#define RExC_precomp (pRExC_state->precomp)
141#define RExC_rx (pRExC_state->rx)
fac92740 142#define RExC_start (pRExC_state->start)
830247a4
IZ
143#define RExC_end (pRExC_state->end)
144#define RExC_parse (pRExC_state->parse)
145#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 146#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 147#define RExC_emit (pRExC_state->emit)
fac92740 148#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
149#define RExC_naughty (pRExC_state->naughty)
150#define RExC_sawback (pRExC_state->sawback)
151#define RExC_seen (pRExC_state->seen)
152#define RExC_size (pRExC_state->size)
153#define RExC_npar (pRExC_state->npar)
154#define RExC_extralen (pRExC_state->extralen)
155#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
156#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 157#define RExC_utf8 (pRExC_state->utf8)
830247a4 158
a687059c
LW
159#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
160#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
161 ((*s) == '{' && regcurly(s)))
a687059c 162
35c8bce7
LW
163#ifdef SPSTART
164#undef SPSTART /* dratted cpp namespace... */
165#endif
a687059c
LW
166/*
167 * Flags to be passed up and down.
168 */
a687059c 169#define WORST 0 /* Worst case. */
821b33a5 170#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
171#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
172#define SPSTART 0x4 /* Starts with * or +. */
173#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 174
2c2d71f5
JH
175/* Length of a variant. */
176
177typedef struct scan_data_t {
178 I32 len_min;
179 I32 len_delta;
180 I32 pos_min;
181 I32 pos_delta;
182 SV *last_found;
183 I32 last_end; /* min value, <0 unless valid. */
184 I32 last_start_min;
185 I32 last_start_max;
186 SV **longest; /* Either &l_fixed, or &l_float. */
187 SV *longest_fixed;
188 I32 offset_fixed;
189 SV *longest_float;
190 I32 offset_float_min;
191 I32 offset_float_max;
192 I32 flags;
193 I32 whilem_c;
cb434fcc 194 I32 *last_closep;
653099ff 195 struct regnode_charclass_class *start_class;
2c2d71f5
JH
196} scan_data_t;
197
a687059c 198/*
e50aee73 199 * Forward declarations for pregcomp()'s friends.
a687059c 200 */
a0d0e21e 201
b81d288d 202static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 203 0, 0, 0, 0, 0, 0};
c277df42
IZ
204
205#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
206#define SF_BEFORE_SEOL 0x1
207#define SF_BEFORE_MEOL 0x2
208#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
209#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
210
09b7f37c
CB
211#ifdef NO_UNARY_PLUS
212# define SF_FIX_SHIFT_EOL (0+2)
213# define SF_FL_SHIFT_EOL (0+4)
214#else
215# define SF_FIX_SHIFT_EOL (+2)
216# define SF_FL_SHIFT_EOL (+4)
217#endif
c277df42
IZ
218
219#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
220#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
221
222#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
223#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
224#define SF_IS_INF 0x40
225#define SF_HAS_PAR 0x80
226#define SF_IN_PAR 0x100
227#define SF_HAS_EVAL 0x200
4bfe0158 228#define SCF_DO_SUBSTR 0x400
653099ff
GS
229#define SCF_DO_STCLASS_AND 0x0800
230#define SCF_DO_STCLASS_OR 0x1000
231#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 232#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 233
1aa99e6b 234#define UTF RExC_utf8
830247a4
IZ
235#define LOC (RExC_flags16 & PMf_LOCALE)
236#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 237
ffc61ed2 238#define OOB_UNICODE 12345678
93733859 239#define OOB_NAMEDCLASS -1
b8c5462f 240
a0ed51b3
LW
241#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
242#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
243
8615cb43 244
b45f050a
JF
245/* length of regex to show in messages that don't mark a position within */
246#define RegexLengthToShowInErrorMessages 127
247
248/*
249 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
250 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
251 * op/pragma/warn/regcomp.
252 */
7253e4e3
RK
253#define MARKER1 "<-- HERE" /* marker as it appears in the description */
254#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 255
7253e4e3 256#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
257
258/*
259 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
260 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 * "...".
262 */
a4eb266f 263#define FAIL(msg) \
8615cb43 264 STMT_START { \
a4eb266f 265 char *ellipses = ""; \
830247a4 266 unsigned len = strlen(RExC_precomp); \
b45f050a 267 \
8615cb43 268 if (!SIZE_ONLY) \
830247a4 269 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
270 \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 274 ellipses = "..."; \
b45f050a
JF
275 } \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 277 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
278 } STMT_END
279
b45f050a
JF
280/*
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
283 * "...".
284 */
a4eb266f 285#define FAIL2(pat,msg) \
8615cb43 286 STMT_START { \
a4eb266f 287 char *ellipses = ""; \
830247a4 288 unsigned len = strlen(RExC_precomp); \
b45f050a 289 \
8615cb43 290 if (!SIZE_ONLY) \
830247a4 291 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 292 \
b45f050a
JF
293 if (len > RegexLengthToShowInErrorMessages) { \
294 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
295 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 296 ellipses = "..."; \
b45f050a
JF
297 } \
298 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 299 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
300 } STMT_END
301
302
303/*
304 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
305 */
306#define Simple_vFAIL(m) \
307 STMT_START { \
830247a4 308 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
309 \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
312 } STMT_END
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
317#define vFAIL(m) \
318 STMT_START { \
319 if (!SIZE_ONLY) \
830247a4 320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
321 Simple_vFAIL(m); \
322 } STMT_END
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
327#define Simple_vFAIL2(m,a1) \
328 STMT_START { \
830247a4 329 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
330 \
331 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 332 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
333 } STMT_END
334
335/*
336 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
337 */
338#define vFAIL2(m,a1) \
339 STMT_START { \
340 if (!SIZE_ONLY) \
830247a4 341 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
342 Simple_vFAIL2(m, a1); \
343 } STMT_END
344
345
346/*
347 * Like Simple_vFAIL(), but accepts three arguments.
348 */
349#define Simple_vFAIL3(m, a1, a2) \
350 STMT_START { \
830247a4 351 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
352 \
353 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 354 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
355 } STMT_END
356
357/*
358 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
359 */
360#define vFAIL3(m,a1,a2) \
361 STMT_START { \
362 if (!SIZE_ONLY) \
830247a4 363 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
364 Simple_vFAIL3(m, a1, a2); \
365 } STMT_END
366
367/*
368 * Like Simple_vFAIL(), but accepts four arguments.
369 */
370#define Simple_vFAIL4(m, a1, a2, a3) \
371 STMT_START { \
830247a4 372 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a
JF
373 \
374 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 375 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
376 } STMT_END
377
378/*
379 * Like Simple_vFAIL(), but accepts five arguments.
380 */
381#define Simple_vFAIL5(m, a1, a2, a3, a4) \
382 STMT_START { \
830247a4 383 unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
b45f050a 384 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 385 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
386 } STMT_END
387
388
389#define vWARN(loc,m) \
390 STMT_START { \
830247a4 391 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a 392 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 393 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
394 } STMT_END \
395
396
397#define vWARN2(loc, m, a1) \
398 STMT_START { \
830247a4 399 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
400 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
401 a1, \
830247a4 402 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
403 } STMT_END
404
405#define vWARN3(loc, m, a1, a2) \
406 STMT_START { \
830247a4 407 unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
b45f050a
JF
408 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
409 a1, a2, \
830247a4 410 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
411 } STMT_END
412
413#define vWARN4(loc, m, a1, a2, a3) \
414 STMT_START { \
830247a4 415 unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
b45f050a
JF
416 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
417 a1, a2, a3, \
830247a4 418 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
419 } STMT_END
420
421
cd439c50
IZ
422/* Allow for side effects in s */
423#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
424
fac92740
MJD
425/* Macros for recording node offsets. 20001227 mjd@plover.com
426 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
427 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
428 * Element 0 holds the number n.
429 */
430
431#define MJD_OFFSET_DEBUG(x)
432/* #define MJD_OFFSET_DEBUG(x) fprintf x */
433
434
435# define Set_Node_Offset_To_R(node,byte) \
436 STMT_START { \
437 if (! SIZE_ONLY) { \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444 } STMT_END
445
446# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
447# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
448
449# define Set_Node_Length_To_R(node,len) \
450 STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
453 if((node) < 0) { \
454 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
455 } else { \
456 RExC_offsets[2*(node)] = (len); \
457 } \
458 } \
459 } STMT_END
460
461# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
462# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
463# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
464
465/* Get offsets and lengths */
466#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
467#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
468
51371543 469static void clear_re(pTHXo_ void *r);
4327152a 470
653099ff
GS
471/* Mark that we cannot extend a found fixed substring at this point.
472 Updata the longest found anchored substring and the longest found
473 floating substrings if needed. */
474
4327152a 475STATIC void
830247a4 476S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 477{
a0ed51b3
LW
478 STRLEN l = CHR_SVLEN(data->last_found);
479 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 480
c277df42
IZ
481 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
482 sv_setsv(*data->longest, data->last_found);
483 if (*data->longest == data->longest_fixed) {
484 data->offset_fixed = l ? data->last_start_min : data->pos_min;
485 if (data->flags & SF_BEFORE_EOL)
b81d288d 486 data->flags
c277df42
IZ
487 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
488 else
489 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
490 }
491 else {
c277df42 492 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
493 data->offset_float_max = (l
494 ? data->last_start_max
c277df42
IZ
495 : data->pos_min + data->pos_delta);
496 if (data->flags & SF_BEFORE_EOL)
b81d288d 497 data->flags
c277df42
IZ
498 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
499 else
500 data->flags &= ~SF_FL_BEFORE_EOL;
501 }
502 }
503 SvCUR_set(data->last_found, 0);
504 data->last_end = -1;
505 data->flags &= ~SF_BEFORE_EOL;
506}
507
653099ff
GS
508/* Can match anything (initialization) */
509STATIC void
830247a4 510S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 511{
9af0acdb
JH
512 int value;
513
653099ff 514 ANYOF_CLASS_ZERO(cl);
9af0acdb
JH
515 for (value = 0; value < 256; ++value)
516 ANYOF_BITMAP_SET(cl, value);
1aa99e6b 517 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
518 if (LOC)
519 cl->flags |= ANYOF_LOCALE;
520}
521
522/* Can match anything (initialization) */
523STATIC int
524S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
525{
526 int value;
527
aaa51d5e 528 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
529 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
530 return 1;
1aa99e6b
IH
531 if (!(cl->flags & ANYOF_UNICODE_ALL))
532 return 0;
9af0acdb
JH
533 for (value = 0; value < 256; ++value)
534 if (!ANYOF_BITMAP_TEST(cl, value))
653099ff
GS
535 return 0;
536 return 1;
537}
538
539/* Can match anything (initialization) */
540STATIC void
830247a4 541S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 542{
8ecf7187 543 Zero(cl, 1, struct regnode_charclass_class);
653099ff 544 cl->type = ANYOF;
830247a4 545 cl_anything(pRExC_state, cl);
653099ff
GS
546}
547
548STATIC void
830247a4 549S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554 if (LOC)
555 cl->flags |= ANYOF_LOCALE;
556}
557
558/* 'And' a given class with another one. Can create false positives */
559/* We assume that cl is not inverted */
560STATIC void
561S_cl_and(pTHX_ struct regnode_charclass_class *cl,
562 struct regnode_charclass_class *and_with)
563{
653099ff
GS
564 if (!(and_with->flags & ANYOF_CLASS)
565 && !(cl->flags & ANYOF_CLASS)
566 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
567 && !(and_with->flags & ANYOF_FOLD)
568 && !(cl->flags & ANYOF_FOLD)) {
569 int i;
570
571 if (and_with->flags & ANYOF_INVERT)
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= ~and_with->bitmap[i];
574 else
575 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
576 cl->bitmap[i] &= and_with->bitmap[i];
577 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
578 if (!(and_with->flags & ANYOF_EOS))
579 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
580
581 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
582 cl->flags &= ~ANYOF_UNICODE_ALL;
583 cl->flags |= ANYOF_UNICODE;
584 ARG_SET(cl, ARG(and_with));
585 }
586 if (!(and_with->flags & ANYOF_UNICODE_ALL))
587 cl->flags &= ~ANYOF_UNICODE_ALL;
588 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
589 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
590}
591
592/* 'OR' a given class with another one. Can create false positives */
593/* We assume that cl is not inverted */
594STATIC void
830247a4 595S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 596{
653099ff
GS
597 if (or_with->flags & ANYOF_INVERT) {
598 /* We do not use
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606 */
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
610 int i;
611
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
615 else {
830247a4 616 cl_anything(pRExC_state, cl);
653099ff
GS
617 }
618 } else {
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 621 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
622 || (cl->flags & ANYOF_FOLD)) ) {
623 int i;
624
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
632 }
633 }
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 635 cl_anything(pRExC_state, cl);
653099ff
GS
636 }
637 }
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
640
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
645 }
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
649 }
653099ff
GS
650}
651
652/* REx optimizer. Converts nodes into quickier variants "in place".
653 Finds fixed substrings. */
654
c277df42
IZ
655/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
656 to the position after last scanned or to NULL. */
657
76e3520e 658STATIC I32
830247a4 659S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
660 /* scanp: Start here (read-write). */
661 /* deltap: Write maxlen-minlen here. */
662 /* last: Stop before this one. */
663{
664 I32 min = 0, pars = 0, code;
665 regnode *scan = *scanp, *next;
666 I32 delta = 0;
667 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 668 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
669 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
670 scan_data_t data_fake;
653099ff 671 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 672
c277df42
IZ
673 while (scan && OP(scan) != END && scan < last) {
674 /* Peephole optimizer: */
675
22c35a8c 676 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 677 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
678 regnode *n = regnext(scan);
679 U32 stringok = 1;
680#ifdef DEBUGGING
681 regnode *stop = scan;
b81d288d 682#endif
c277df42 683
cd439c50 684 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
685 /* Skip NOTHING, merge EXACT*. */
686 while (n &&
b81d288d 687 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
688 (stringok && (OP(n) == OP(scan))))
689 && NEXT_OFF(n)
690 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
691 if (OP(n) == TAIL || n > next)
692 stringok = 0;
22c35a8c 693 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
694 NEXT_OFF(scan) += NEXT_OFF(n);
695 next = n + NODE_STEP_REGNODE;
696#ifdef DEBUGGING
697 if (stringok)
698 stop = n;
b81d288d 699#endif
c277df42 700 n = regnext(n);
a0ed51b3 701 }
f49d4d0f 702 else if (stringok) {
cd439c50 703 int oldl = STR_LEN(scan);
c277df42 704 regnode *nnext = regnext(n);
f49d4d0f 705
b81d288d 706 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
707 break;
708 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
709 STR_LEN(scan) += STR_LEN(n);
710 next = n + NODE_SZ_STR(n);
c277df42 711 /* Now we can overwrite *n : */
f49d4d0f 712 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 713#ifdef DEBUGGING
f49d4d0f 714 stop = next - 1;
b81d288d 715#endif
c277df42
IZ
716 n = nnext;
717 }
718 }
719#ifdef DEBUGGING
720 /* Allow dumping */
cd439c50 721 n = scan + NODE_SZ_STR(scan);
c277df42 722 while (n <= stop) {
22c35a8c 723 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
724 OP(n) = OPTIMIZED;
725 NEXT_OFF(n) = 0;
726 }
727 n++;
728 }
653099ff 729#endif
c277df42 730 }
653099ff
GS
731 /* Follow the next-chain of the current node and optimize
732 away all the NOTHINGs from it. */
c277df42 733 if (OP(scan) != CURLYX) {
048cfca1
GS
734 int max = (reg_off_by_arg[OP(scan)]
735 ? I32_MAX
736 /* I32 may be smaller than U16 on CRAYs! */
737 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
738 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
739 int noff;
740 regnode *n = scan;
b81d288d 741
c277df42
IZ
742 /* Skip NOTHING and LONGJMP. */
743 while ((n = regnext(n))
22c35a8c 744 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
745 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
746 && off + noff < max)
747 off += noff;
748 if (reg_off_by_arg[OP(scan)])
749 ARG(scan) = off;
b81d288d 750 else
c277df42
IZ
751 NEXT_OFF(scan) = off;
752 }
653099ff
GS
753 /* The principal pseudo-switch. Cannot be a switch, since we
754 look into several different things. */
b81d288d 755 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
756 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
757 next = regnext(scan);
758 code = OP(scan);
b81d288d
AB
759
760 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 761 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 762 struct regnode_charclass_class accum;
c277df42 763
653099ff 764 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 765 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 766 if (flags & SCF_DO_STCLASS)
830247a4 767 cl_init_zero(pRExC_state, &accum);
c277df42 768 while (OP(scan) == code) {
830247a4 769 I32 deltanext, minnext, f = 0, fake;
653099ff 770 struct regnode_charclass_class this_class;
c277df42
IZ
771
772 num++;
773 data_fake.flags = 0;
b81d288d 774 if (data) {
2c2d71f5 775 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
776 data_fake.last_closep = data->last_closep;
777 }
778 else
779 data_fake.last_closep = &fake;
c277df42
IZ
780 next = regnext(scan);
781 scan = NEXTOPER(scan);
782 if (code != BRANCH)
783 scan = NEXTOPER(scan);
653099ff 784 if (flags & SCF_DO_STCLASS) {
830247a4 785 cl_init(pRExC_state, &this_class);
653099ff
GS
786 data_fake.start_class = &this_class;
787 f = SCF_DO_STCLASS_AND;
b81d288d 788 }
e1901655
IZ
789 if (flags & SCF_WHILEM_VISITED_POS)
790 f |= SCF_WHILEM_VISITED_POS;
653099ff 791 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
792 minnext = study_chunk(pRExC_state, &scan, &deltanext,
793 next, &data_fake, f);
b81d288d 794 if (min1 > minnext)
c277df42
IZ
795 min1 = minnext;
796 if (max1 < minnext + deltanext)
797 max1 = minnext + deltanext;
798 if (deltanext == I32_MAX)
aca2d497 799 is_inf = is_inf_internal = 1;
c277df42
IZ
800 scan = next;
801 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
802 pars++;
405ff068 803 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 804 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
805 if (data)
806 data->whilem_c = data_fake.whilem_c;
653099ff 807 if (flags & SCF_DO_STCLASS)
830247a4 808 cl_or(pRExC_state, &accum, &this_class);
b81d288d 809 if (code == SUSPEND)
c277df42
IZ
810 break;
811 }
812 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
813 min1 = 0;
814 if (flags & SCF_DO_SUBSTR) {
815 data->pos_min += min1;
816 data->pos_delta += max1 - min1;
817 if (max1 != min1 || is_inf)
818 data->longest = &(data->longest_float);
819 }
820 min += min1;
821 delta += max1 - min1;
653099ff 822 if (flags & SCF_DO_STCLASS_OR) {
830247a4 823 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
824 if (min1) {
825 cl_and(data->start_class, &and_with);
826 flags &= ~SCF_DO_STCLASS;
827 }
828 }
829 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
830 if (min1) {
831 cl_and(data->start_class, &accum);
653099ff 832 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
833 }
834 else {
b81d288d 835 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
836 * data->start_class */
837 StructCopy(data->start_class, &and_with,
838 struct regnode_charclass_class);
839 flags &= ~SCF_DO_STCLASS_AND;
840 StructCopy(&accum, data->start_class,
841 struct regnode_charclass_class);
842 flags |= SCF_DO_STCLASS_OR;
843 data->start_class->flags |= ANYOF_EOS;
844 }
653099ff 845 }
a0ed51b3
LW
846 }
847 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
848 scan = NEXTOPER(NEXTOPER(scan));
849 else /* single branch is optimized. */
850 scan = NEXTOPER(scan);
851 continue;
a0ed51b3
LW
852 }
853 else if (OP(scan) == EXACT) {
cd439c50 854 I32 l = STR_LEN(scan);
1aa99e6b 855 UV uc = *((U8*)STRING(scan));
a0ed51b3 856 if (UTF) {
1aa99e6b
IH
857 U8 *s = (U8*)STRING(scan);
858 l = utf8_length(s, s + l);
9041c2e3 859 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
860 }
861 min += l;
c277df42 862 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
863 /* The code below prefers earlier match for fixed
864 offset, later match for variable offset. */
865 if (data->last_end == -1) { /* Update the start info. */
866 data->last_start_min = data->pos_min;
867 data->last_start_max = is_inf
b81d288d 868 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 869 }
cd439c50 870 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
871 data->last_end = data->pos_min + l;
872 data->pos_min += l; /* As in the first entry. */
873 data->flags &= ~SF_BEFORE_EOL;
874 }
653099ff
GS
875 if (flags & SCF_DO_STCLASS_AND) {
876 /* Check whether it is compatible with what we know already! */
877 int compat = 1;
878
1aa99e6b 879 if (uc >= 0x100 ||
516a5887 880 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 881 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 882 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 883 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 884 )
653099ff
GS
885 compat = 0;
886 ANYOF_CLASS_ZERO(data->start_class);
887 ANYOF_BITMAP_ZERO(data->start_class);
888 if (compat)
1aa99e6b 889 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 890 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
891 if (uc < 0x100)
892 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
893 }
894 else if (flags & SCF_DO_STCLASS_OR) {
895 /* false positive possible if the class is case-folded */
1aa99e6b 896 if (uc < 0x100)
9b877dbb
IH
897 ANYOF_BITMAP_SET(data->start_class, uc);
898 else
899 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
900 data->start_class->flags &= ~ANYOF_EOS;
901 cl_and(data->start_class, &and_with);
902 }
903 flags &= ~SCF_DO_STCLASS;
a0ed51b3 904 }
653099ff 905 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 906 I32 l = STR_LEN(scan);
1aa99e6b 907 UV uc = *((U8*)STRING(scan));
653099ff
GS
908
909 /* Search for fixed substrings supports EXACT only. */
b81d288d 910 if (flags & SCF_DO_SUBSTR)
830247a4 911 scan_commit(pRExC_state, data);
a0ed51b3 912 if (UTF) {
1aa99e6b
IH
913 U8 *s = (U8 *)STRING(scan);
914 l = utf8_length(s, s + l);
9041c2e3 915 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
916 }
917 min += l;
c277df42 918 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 919 data->pos_min += l;
653099ff
GS
920 if (flags & SCF_DO_STCLASS_AND) {
921 /* Check whether it is compatible with what we know already! */
922 int compat = 1;
923
1aa99e6b 924 if (uc >= 0x100 ||
516a5887 925 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 926 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 927 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
928 compat = 0;
929 ANYOF_CLASS_ZERO(data->start_class);
930 ANYOF_BITMAP_ZERO(data->start_class);
931 if (compat) {
1aa99e6b 932 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
933 data->start_class->flags &= ~ANYOF_EOS;
934 data->start_class->flags |= ANYOF_FOLD;
935 if (OP(scan) == EXACTFL)
936 data->start_class->flags |= ANYOF_LOCALE;
937 }
938 }
939 else if (flags & SCF_DO_STCLASS_OR) {
940 if (data->start_class->flags & ANYOF_FOLD) {
941 /* false positive possible if the class is case-folded.
942 Assume that the locale settings are the same... */
1aa99e6b
IH
943 if (uc < 0x100)
944 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
945 data->start_class->flags &= ~ANYOF_EOS;
946 }
947 cl_and(data->start_class, &and_with);
948 }
949 flags &= ~SCF_DO_STCLASS;
a0ed51b3 950 }
4d61ec05 951 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 952 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 953 I32 f = flags, pos_before = 0;
c277df42 954 regnode *oscan = scan;
653099ff
GS
955 struct regnode_charclass_class this_class;
956 struct regnode_charclass_class *oclass = NULL;
957
22c35a8c 958 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 959 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
960 scan = NEXTOPER(scan);
961 goto finish;
962 case PLUS:
653099ff 963 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 964 next = NEXTOPER(scan);
653099ff 965 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
966 mincount = 1;
967 maxcount = REG_INFTY;
c277df42
IZ
968 next = regnext(scan);
969 scan = NEXTOPER(scan);
970 goto do_curly;
971 }
972 }
973 if (flags & SCF_DO_SUBSTR)
974 data->pos_min++;
975 min++;
976 /* Fall through. */
977 case STAR:
653099ff
GS
978 if (flags & SCF_DO_STCLASS) {
979 mincount = 0;
b81d288d 980 maxcount = REG_INFTY;
653099ff
GS
981 next = regnext(scan);
982 scan = NEXTOPER(scan);
983 goto do_curly;
984 }
b81d288d 985 is_inf = is_inf_internal = 1;
c277df42
IZ
986 scan = regnext(scan);
987 if (flags & SCF_DO_SUBSTR) {
830247a4 988 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
989 data->longest = &(data->longest_float);
990 }
991 goto optimize_curly_tail;
992 case CURLY:
b81d288d 993 mincount = ARG1(scan);
c277df42
IZ
994 maxcount = ARG2(scan);
995 next = regnext(scan);
cb434fcc
IZ
996 if (OP(scan) == CURLYX) {
997 I32 lp = (data ? *(data->last_closep) : 0);
998
999 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1000 }
c277df42
IZ
1001 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1002 do_curly:
1003 if (flags & SCF_DO_SUBSTR) {
830247a4 1004 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1005 pos_before = data->pos_min;
1006 }
1007 if (data) {
1008 fl = data->flags;
1009 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1010 if (is_inf)
1011 data->flags |= SF_IS_INF;
1012 }
653099ff 1013 if (flags & SCF_DO_STCLASS) {
830247a4 1014 cl_init(pRExC_state, &this_class);
653099ff
GS
1015 oclass = data->start_class;
1016 data->start_class = &this_class;
1017 f |= SCF_DO_STCLASS_AND;
1018 f &= ~SCF_DO_STCLASS_OR;
1019 }
e1901655
IZ
1020 /* These are the cases when once a subexpression
1021 fails at a particular position, it cannot succeed
1022 even after backtracking at the enclosing scope.
b81d288d 1023
e1901655
IZ
1024 XXXX what if minimal match and we are at the
1025 initial run of {n,m}? */
1026 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1027 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1028
c277df42 1029 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1030 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1031 mincount == 0
653099ff
GS
1032 ? (f & ~SCF_DO_SUBSTR) : f);
1033
1034 if (flags & SCF_DO_STCLASS)
1035 data->start_class = oclass;
1036 if (mincount == 0 || minnext == 0) {
1037 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1038 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1039 }
1040 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1041 /* Switch to OR mode: cache the old value of
653099ff
GS
1042 * data->start_class */
1043 StructCopy(data->start_class, &and_with,
1044 struct regnode_charclass_class);
1045 flags &= ~SCF_DO_STCLASS_AND;
1046 StructCopy(&this_class, data->start_class,
1047 struct regnode_charclass_class);
1048 flags |= SCF_DO_STCLASS_OR;
1049 data->start_class->flags |= ANYOF_EOS;
1050 }
1051 } else { /* Non-zero len */
1052 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1053 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1054 cl_and(data->start_class, &and_with);
1055 }
1056 else if (flags & SCF_DO_STCLASS_AND)
1057 cl_and(data->start_class, &this_class);
1058 flags &= ~SCF_DO_STCLASS;
1059 }
c277df42
IZ
1060 if (!scan) /* It was not CURLYX, but CURLY. */
1061 scan = next;
b81d288d 1062 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
99799961 1063 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1064 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1065 {
830247a4 1066 vWARN(RExC_parse,
b45f050a
JF
1067 "Quantifier unexpected on zero-length expression");
1068 }
1069
c277df42 1070 min += minnext * mincount;
b81d288d 1071 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1072 && (minnext + deltanext) > 0)
1073 || deltanext == I32_MAX);
aca2d497 1074 is_inf |= is_inf_internal;
c277df42
IZ
1075 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1076
1077 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1078 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1079 && data->flags & SF_IN_PAR
1080 && !(data->flags & SF_HAS_EVAL)
1081 && !deltanext && minnext == 1 ) {
1082 /* Try to optimize to CURLYN. */
1083 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1084 regnode *nxt1 = nxt, *nxt2;
1085
1086 /* Skip open. */
1087 nxt = regnext(nxt);
4d61ec05 1088 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1089 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1090 && STR_LEN(nxt) == 1))
c277df42
IZ
1091 goto nogo;
1092 nxt2 = nxt;
1093 nxt = regnext(nxt);
b81d288d 1094 if (OP(nxt) != CLOSE)
c277df42
IZ
1095 goto nogo;
1096 /* Now we know that nxt2 is the only contents: */
1097 oscan->flags = ARG(nxt);
1098 OP(oscan) = CURLYN;
1099 OP(nxt1) = NOTHING; /* was OPEN. */
1100#ifdef DEBUGGING
1101 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1102 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1103 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1104 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1105 OP(nxt + 1) = OPTIMIZED; /* was count. */
1106 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1107#endif
c277df42 1108 }
c277df42
IZ
1109 nogo:
1110
1111 /* Try optimization CURLYX => CURLYM. */
b81d288d 1112 if ( OP(oscan) == CURLYX && data
c277df42 1113 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1114 && !(data->flags & SF_HAS_EVAL)
1115 && !deltanext ) {
1116 /* XXXX How to optimize if data == 0? */
1117 /* Optimize to a simpler form. */
1118 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1119 regnode *nxt2;
1120
1121 OP(oscan) = CURLYM;
1122 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1123 && (OP(nxt2) != WHILEM))
c277df42
IZ
1124 nxt = nxt2;
1125 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1126 /* Need to optimize away parenths. */
1127 if (data->flags & SF_IN_PAR) {
1128 /* Set the parenth number. */
1129 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1130
b81d288d 1131 if (OP(nxt) != CLOSE)
b45f050a 1132 FAIL("Panic opt close");
c277df42
IZ
1133 oscan->flags = ARG(nxt);
1134 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1135 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1136#ifdef DEBUGGING
1137 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1138 OP(nxt + 1) = OPTIMIZED; /* was count. */
1139 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1140 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1141#endif
c277df42
IZ
1142#if 0
1143 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1144 regnode *nnxt = regnext(nxt1);
b81d288d 1145
c277df42
IZ
1146 if (nnxt == nxt) {
1147 if (reg_off_by_arg[OP(nxt1)])
1148 ARG_SET(nxt1, nxt2 - nxt1);
1149 else if (nxt2 - nxt1 < U16_MAX)
1150 NEXT_OFF(nxt1) = nxt2 - nxt1;
1151 else
1152 OP(nxt) = NOTHING; /* Cannot beautify */
1153 }
1154 nxt1 = nnxt;
1155 }
1156#endif
1157 /* Optimize again: */
b81d288d 1158 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1159 NULL, 0);
a0ed51b3
LW
1160 }
1161 else
c277df42 1162 oscan->flags = 0;
c277df42 1163 }
e1901655
IZ
1164 else if ((OP(oscan) == CURLYX)
1165 && (flags & SCF_WHILEM_VISITED_POS)
1166 /* See the comment on a similar expression above.
1167 However, this time it not a subexpression
1168 we care about, but the expression itself. */
1169 && (maxcount == REG_INFTY)
1170 && data && ++data->whilem_c < 16) {
1171 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1172 /* Find WHILEM (as in regexec.c) */
1173 regnode *nxt = oscan + NEXT_OFF(oscan);
1174
1175 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1176 nxt += ARG(nxt);
1177 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1178 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1179 }
b81d288d 1180 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1181 pars++;
1182 if (flags & SCF_DO_SUBSTR) {
1183 SV *last_str = Nullsv;
1184 int counted = mincount != 0;
1185
1186 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
b81d288d 1187 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1188 ? pos_before : data->last_start_min;
1189 STRLEN l;
1190 char *s = SvPV(data->last_found, l);
a0ed51b3
LW
1191 I32 old = b - data->last_start_min;
1192
1193 if (UTF)
1194 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1195
a0ed51b3 1196 l -= old;
c277df42 1197 /* Get the added string: */
79cb57f6 1198 last_str = newSVpvn(s + old, l);
c277df42
IZ
1199 if (deltanext == 0 && pos_before == b) {
1200 /* What was added is a constant string */
1201 if (mincount > 1) {
1202 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1203 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1204 SvPVX(last_str), l, mincount - 1);
1205 SvCUR(last_str) *= mincount;
1206 /* Add additional parts. */
b81d288d 1207 SvCUR_set(data->last_found,
c277df42
IZ
1208 SvCUR(data->last_found) - l);
1209 sv_catsv(data->last_found, last_str);
1210 data->last_end += l * (mincount - 1);
1211 }
2a8d9689
HS
1212 } else {
1213 /* start offset must point into the last copy */
1214 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1215 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1216 * (minnext + data->pos_delta);
c277df42
IZ
1217 }
1218 }
1219 /* It is counted once already... */
1220 data->pos_min += minnext * (mincount - counted);
1221 data->pos_delta += - counted * deltanext +
1222 (minnext + deltanext) * maxcount - minnext * mincount;
1223 if (mincount != maxcount) {
653099ff
GS
1224 /* Cannot extend fixed substrings found inside
1225 the group. */
830247a4 1226 scan_commit(pRExC_state,data);
c277df42
IZ
1227 if (mincount && last_str) {
1228 sv_setsv(data->last_found, last_str);
1229 data->last_end = data->pos_min;
b81d288d 1230 data->last_start_min =
a0ed51b3 1231 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1232 data->last_start_max = is_inf
1233 ? I32_MAX
c277df42 1234 : data->pos_min + data->pos_delta
a0ed51b3 1235 - CHR_SVLEN(last_str);
c277df42
IZ
1236 }
1237 data->longest = &(data->longest_float);
1238 }
aca2d497 1239 SvREFCNT_dec(last_str);
c277df42 1240 }
405ff068 1241 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1242 data->flags |= SF_HAS_EVAL;
1243 optimize_curly_tail:
c277df42 1244 if (OP(oscan) != CURLYX) {
22c35a8c 1245 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1246 && NEXT_OFF(next))
1247 NEXT_OFF(oscan) += NEXT_OFF(next);
1248 }
c277df42 1249 continue;
653099ff 1250 default: /* REF and CLUMP only? */
c277df42 1251 if (flags & SCF_DO_SUBSTR) {
830247a4 1252 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1253 data->longest = &(data->longest_float);
1254 }
aca2d497 1255 is_inf = is_inf_internal = 1;
653099ff 1256 if (flags & SCF_DO_STCLASS_OR)
830247a4 1257 cl_anything(pRExC_state, data->start_class);
653099ff 1258 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1259 break;
1260 }
a0ed51b3 1261 }
ffc61ed2 1262 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1263 int value = 0;
653099ff 1264
c277df42 1265 if (flags & SCF_DO_SUBSTR) {
830247a4 1266 scan_commit(pRExC_state,data);
c277df42
IZ
1267 data->pos_min++;
1268 }
1269 min++;
653099ff
GS
1270 if (flags & SCF_DO_STCLASS) {
1271 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1272
1273 /* Some of the logic below assumes that switching
1274 locale on will only add false positives. */
1275 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1276 case SANY:
653099ff
GS
1277 default:
1278 do_default:
1279 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1280 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1281 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1282 break;
1283 case REG_ANY:
1284 if (OP(scan) == SANY)
1285 goto do_default;
1286 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1287 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1288 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1289 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1290 }
1291 if (flags & SCF_DO_STCLASS_AND || !value)
1292 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1293 break;
1294 case ANYOF:
1295 if (flags & SCF_DO_STCLASS_AND)
1296 cl_and(data->start_class,
1297 (struct regnode_charclass_class*)scan);
1298 else
830247a4 1299 cl_or(pRExC_state, data->start_class,
653099ff
GS
1300 (struct regnode_charclass_class*)scan);
1301 break;
1302 case ALNUM:
1303 if (flags & SCF_DO_STCLASS_AND) {
1304 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1305 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1306 for (value = 0; value < 256; value++)
1307 if (!isALNUM(value))
1308 ANYOF_BITMAP_CLEAR(data->start_class, value);
1309 }
1310 }
1311 else {
1312 if (data->start_class->flags & ANYOF_LOCALE)
1313 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1314 else {
1315 for (value = 0; value < 256; value++)
1316 if (isALNUM(value))
b81d288d 1317 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1318 }
1319 }
1320 break;
1321 case ALNUML:
1322 if (flags & SCF_DO_STCLASS_AND) {
1323 if (data->start_class->flags & ANYOF_LOCALE)
1324 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1325 }
1326 else {
1327 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1328 data->start_class->flags |= ANYOF_LOCALE;
1329 }
1330 break;
1331 case NALNUM:
1332 if (flags & SCF_DO_STCLASS_AND) {
1333 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1334 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1335 for (value = 0; value < 256; value++)
1336 if (isALNUM(value))
1337 ANYOF_BITMAP_CLEAR(data->start_class, value);
1338 }
1339 }
1340 else {
1341 if (data->start_class->flags & ANYOF_LOCALE)
1342 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1343 else {
1344 for (value = 0; value < 256; value++)
1345 if (!isALNUM(value))
b81d288d 1346 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1347 }
1348 }
1349 break;
1350 case NALNUML:
1351 if (flags & SCF_DO_STCLASS_AND) {
1352 if (data->start_class->flags & ANYOF_LOCALE)
1353 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1354 }
1355 else {
1356 data->start_class->flags |= ANYOF_LOCALE;
1357 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1358 }
1359 break;
1360 case SPACE:
1361 if (flags & SCF_DO_STCLASS_AND) {
1362 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1363 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1364 for (value = 0; value < 256; value++)
1365 if (!isSPACE(value))
1366 ANYOF_BITMAP_CLEAR(data->start_class, value);
1367 }
1368 }
1369 else {
1370 if (data->start_class->flags & ANYOF_LOCALE)
1371 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1372 else {
1373 for (value = 0; value < 256; value++)
1374 if (isSPACE(value))
b81d288d 1375 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1376 }
1377 }
1378 break;
1379 case SPACEL:
1380 if (flags & SCF_DO_STCLASS_AND) {
1381 if (data->start_class->flags & ANYOF_LOCALE)
1382 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1383 }
1384 else {
1385 data->start_class->flags |= ANYOF_LOCALE;
1386 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1387 }
1388 break;
1389 case NSPACE:
1390 if (flags & SCF_DO_STCLASS_AND) {
1391 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1392 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1393 for (value = 0; value < 256; value++)
1394 if (isSPACE(value))
1395 ANYOF_BITMAP_CLEAR(data->start_class, value);
1396 }
1397 }
1398 else {
1399 if (data->start_class->flags & ANYOF_LOCALE)
1400 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1401 else {
1402 for (value = 0; value < 256; value++)
1403 if (!isSPACE(value))
b81d288d 1404 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1405 }
1406 }
1407 break;
1408 case NSPACEL:
1409 if (flags & SCF_DO_STCLASS_AND) {
1410 if (data->start_class->flags & ANYOF_LOCALE) {
1411 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1412 for (value = 0; value < 256; value++)
1413 if (!isSPACE(value))
1414 ANYOF_BITMAP_CLEAR(data->start_class, value);
1415 }
1416 }
1417 else {
1418 data->start_class->flags |= ANYOF_LOCALE;
1419 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1420 }
1421 break;
1422 case DIGIT:
1423 if (flags & SCF_DO_STCLASS_AND) {
1424 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1425 for (value = 0; value < 256; value++)
1426 if (!isDIGIT(value))
1427 ANYOF_BITMAP_CLEAR(data->start_class, value);
1428 }
1429 else {
1430 if (data->start_class->flags & ANYOF_LOCALE)
1431 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1432 else {
1433 for (value = 0; value < 256; value++)
1434 if (isDIGIT(value))
b81d288d 1435 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1436 }
1437 }
1438 break;
1439 case NDIGIT:
1440 if (flags & SCF_DO_STCLASS_AND) {
1441 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1442 for (value = 0; value < 256; value++)
1443 if (isDIGIT(value))
1444 ANYOF_BITMAP_CLEAR(data->start_class, value);
1445 }
1446 else {
1447 if (data->start_class->flags & ANYOF_LOCALE)
1448 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1449 else {
1450 for (value = 0; value < 256; value++)
1451 if (!isDIGIT(value))
b81d288d 1452 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1453 }
1454 }
1455 break;
1456 }
1457 if (flags & SCF_DO_STCLASS_OR)
1458 cl_and(data->start_class, &and_with);
1459 flags &= ~SCF_DO_STCLASS;
1460 }
a0ed51b3 1461 }
22c35a8c 1462 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1463 data->flags |= (OP(scan) == MEOL
1464 ? SF_BEFORE_MEOL
1465 : SF_BEFORE_SEOL);
a0ed51b3 1466 }
653099ff
GS
1467 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1468 /* Lookbehind, or need to calculate parens/evals/stclass: */
1469 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1470 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1471 /* Lookahead/lookbehind */
cb434fcc 1472 I32 deltanext, minnext, fake = 0;
c277df42 1473 regnode *nscan;
653099ff
GS
1474 struct regnode_charclass_class intrnl;
1475 int f = 0;
c277df42
IZ
1476
1477 data_fake.flags = 0;
b81d288d 1478 if (data) {
2c2d71f5 1479 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1480 data_fake.last_closep = data->last_closep;
1481 }
1482 else
1483 data_fake.last_closep = &fake;
653099ff
GS
1484 if ( flags & SCF_DO_STCLASS && !scan->flags
1485 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1486 cl_init(pRExC_state, &intrnl);
653099ff 1487 data_fake.start_class = &intrnl;
e1901655 1488 f |= SCF_DO_STCLASS_AND;
653099ff 1489 }
e1901655
IZ
1490 if (flags & SCF_WHILEM_VISITED_POS)
1491 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1492 next = regnext(scan);
1493 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1494 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1495 if (scan->flags) {
1496 if (deltanext) {
9baa0206 1497 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1498 }
1499 else if (minnext > U8_MAX) {
9baa0206 1500 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1501 }
1502 scan->flags = minnext;
1503 }
1504 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1505 pars++;
405ff068 1506 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1507 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1508 if (data)
1509 data->whilem_c = data_fake.whilem_c;
e1901655 1510 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1511 int was = (data->start_class->flags & ANYOF_EOS);
1512
1513 cl_and(data->start_class, &intrnl);
1514 if (was)
1515 data->start_class->flags |= ANYOF_EOS;
1516 }
a0ed51b3
LW
1517 }
1518 else if (OP(scan) == OPEN) {
c277df42 1519 pars++;
a0ed51b3 1520 }
cb434fcc
IZ
1521 else if (OP(scan) == CLOSE) {
1522 if (ARG(scan) == is_par) {
1523 next = regnext(scan);
c277df42 1524
cb434fcc
IZ
1525 if ( next && (OP(next) != WHILEM) && next < last)
1526 is_par = 0; /* Disable optimization */
1527 }
1528 if (data)
1529 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1530 }
1531 else if (OP(scan) == EVAL) {
c277df42
IZ
1532 if (data)
1533 data->flags |= SF_HAS_EVAL;
1534 }
96776eda 1535 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1536 if (flags & SCF_DO_SUBSTR) {
830247a4 1537 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1538 data->longest = &(data->longest_float);
1539 }
1540 is_inf = is_inf_internal = 1;
653099ff 1541 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1542 cl_anything(pRExC_state, data->start_class);
96776eda 1543 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1544 }
c277df42
IZ
1545 /* Else: zero-length, ignore. */
1546 scan = regnext(scan);
1547 }
1548
1549 finish:
1550 *scanp = scan;
aca2d497 1551 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1552 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1553 data->pos_delta = I32_MAX - data->pos_min;
1554 if (is_par > U8_MAX)
1555 is_par = 0;
1556 if (is_par && pars==1 && data) {
1557 data->flags |= SF_IN_PAR;
1558 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1559 }
1560 else if (pars && data) {
c277df42
IZ
1561 data->flags |= SF_HAS_PAR;
1562 data->flags &= ~SF_IN_PAR;
1563 }
653099ff
GS
1564 if (flags & SCF_DO_STCLASS_OR)
1565 cl_and(data->start_class, &and_with);
c277df42
IZ
1566 return min;
1567}
1568
76e3520e 1569STATIC I32
830247a4 1570S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1571{
830247a4 1572 if (RExC_rx->data) {
b81d288d
AB
1573 Renewc(RExC_rx->data,
1574 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1575 char, struct reg_data);
830247a4
IZ
1576 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1577 RExC_rx->data->count += n;
a0ed51b3
LW
1578 }
1579 else {
830247a4 1580 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1581 char, struct reg_data);
830247a4
IZ
1582 New(1208, RExC_rx->data->what, n, U8);
1583 RExC_rx->data->count = n;
c277df42 1584 }
830247a4
IZ
1585 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1586 return RExC_rx->data->count - n;
c277df42
IZ
1587}
1588
d88dccdf 1589void
864dbfa3 1590Perl_reginitcolors(pTHX)
d88dccdf 1591{
d88dccdf
IZ
1592 int i = 0;
1593 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1594
d88dccdf
IZ
1595 if (s) {
1596 PL_colors[0] = s = savepv(s);
1597 while (++i < 6) {
1598 s = strchr(s, '\t');
1599 if (s) {
1600 *s = '\0';
1601 PL_colors[i] = ++s;
1602 }
1603 else
c712d376 1604 PL_colors[i] = s = "";
d88dccdf
IZ
1605 }
1606 } else {
b81d288d 1607 while (i < 6)
d88dccdf
IZ
1608 PL_colors[i++] = "";
1609 }
1610 PL_colorset = 1;
1611}
1612
8615cb43 1613
a687059c 1614/*
e50aee73 1615 - pregcomp - compile a regular expression into internal code
a687059c
LW
1616 *
1617 * We can't allocate space until we know how big the compiled form will be,
1618 * but we can't compile it (and thus know how big it is) until we've got a
1619 * place to put the code. So we cheat: we compile it twice, once with code
1620 * generation turned off and size counting turned on, and once "for real".
1621 * This also means that we don't allocate space until we are sure that the
1622 * thing really will compile successfully, and we never have to move the
1623 * code and thus invalidate pointers into it. (Note that it has to be in
1624 * one piece because free() must be able to free it all.) [NB: not true in perl]
1625 *
1626 * Beware that the optimization-preparation code in here knows about some
1627 * of the structure of the compiled regexp. [I'll say.]
1628 */
1629regexp *
864dbfa3 1630Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1631{
a0d0e21e 1632 register regexp *r;
c277df42 1633 regnode *scan;
c277df42 1634 regnode *first;
a0d0e21e 1635 I32 flags;
a0d0e21e
LW
1636 I32 minlen = 0;
1637 I32 sawplus = 0;
1638 I32 sawopen = 0;
2c2d71f5 1639 scan_data_t data;
830247a4
IZ
1640 RExC_state_t RExC_state;
1641 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1642
1643 if (exp == NULL)
c277df42 1644 FAIL("NULL regexp argument");
a0d0e21e 1645
830247a4 1646 /* XXXX This looks very suspicious... */
1aa99e6b
IH
1647 if (pm->op_pmdynflags & PMdf_CMP_UTF8)
1648 RExC_utf8 = 1;
a0ed51b3 1649 else
1aa99e6b 1650 RExC_utf8 = 0;
a0ed51b3 1651
5cfc7842 1652 RExC_precomp = exp;
35ef4773 1653 DEBUG_r(if (!PL_colorset) reginitcolors());
2c2d71f5 1654 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
d88dccdf 1655 PL_colors[4],PL_colors[5],PL_colors[0],
830247a4
IZ
1656 (int)(xend - exp), RExC_precomp, PL_colors[1]));
1657 RExC_flags16 = pm->op_pmflags;
1658 RExC_sawback = 0;
bbce6d69 1659
830247a4
IZ
1660 RExC_seen = 0;
1661 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1662 RExC_seen_evals = 0;
1663 RExC_extralen = 0;
c277df42 1664
bbce6d69 1665 /* First pass: determine size, legality. */
830247a4 1666 RExC_parse = exp;
fac92740 1667 RExC_start = exp;
830247a4
IZ
1668 RExC_end = xend;
1669 RExC_naughty = 0;
1670 RExC_npar = 1;
1671 RExC_size = 0L;
1672 RExC_emit = &PL_regdummy;
1673 RExC_whilem_seen = 0;
85ddcde9
JH
1674#if 0 /* REGC() is (currently) a NOP at the first pass.
1675 * Clever compilers notice this and complain. --jhi */
830247a4 1676 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1677#endif
830247a4 1678 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1679 RExC_precomp = Nullch;
a0d0e21e
LW
1680 return(NULL);
1681 }
830247a4 1682 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1683
c277df42
IZ
1684 /* Small enough for pointer-storage convention?
1685 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1686 if (RExC_size >= 0x10000L && RExC_extralen)
1687 RExC_size += RExC_extralen;
c277df42 1688 else
830247a4
IZ
1689 RExC_extralen = 0;
1690 if (RExC_whilem_seen > 15)
1691 RExC_whilem_seen = 15;
a0d0e21e 1692
bbce6d69 1693 /* Allocate space and initialize. */
830247a4 1694 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1695 char, regexp);
a0d0e21e 1696 if (r == NULL)
b45f050a
JF
1697 FAIL("Regexp out of space");
1698
0f79a09d
GS
1699#ifdef DEBUGGING
1700 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1701 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1702#endif
c277df42 1703 r->refcnt = 1;
bbce6d69 1704 r->prelen = xend - exp;
5cfc7842 1705 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1706 r->subbeg = NULL;
1707 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1708 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1709
1710 r->substrs = 0; /* Useful during FAIL. */
1711 r->startp = 0; /* Useful during FAIL. */
1712 r->endp = 0; /* Useful during FAIL. */
1713
fac92740
MJD
1714 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1715 if (r->offsets) {
1716 r->offsets[0] = RExC_size;
1717 }
1718 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1719 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1720 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1721 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1722
830247a4 1723 RExC_rx = r;
bbce6d69
PP
1724
1725 /* Second pass: emit code. */
830247a4
IZ
1726 RExC_parse = exp;
1727 RExC_end = xend;
1728 RExC_naughty = 0;
1729 RExC_npar = 1;
fac92740 1730 RExC_emit_start = r->program;
830247a4 1731 RExC_emit = r->program;
2cd61cdb 1732 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1733 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1734 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1735 r->data = 0;
830247a4 1736 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1737 return(NULL);
1738
1739 /* Dig out information for optimizations. */
cf93c79d 1740 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1741 pm->op_pmflags = RExC_flags16;
a0ed51b3
LW
1742 if (UTF)
1743 r->reganch |= ROPT_UTF8;
c277df42 1744 r->regstclass = NULL;
830247a4 1745 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1746 r->reganch |= ROPT_NAUGHTY;
c277df42 1747 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1748
1749 /* XXXX To minimize changes to RE engine we always allocate
1750 3-units-long substrs field. */
1751 Newz(1004, r->substrs, 1, struct reg_substr_data);
1752
2c2d71f5 1753 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1754 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1755 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1756 I32 fake;
c5254dd6 1757 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1758 struct regnode_charclass_class ch_class;
1759 int stclass_flag;
cb434fcc 1760 I32 last_close = 0;
a0d0e21e
LW
1761
1762 first = scan;
c277df42 1763 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1764 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1765 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1766 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1767 (OP(first) == PLUS) ||
1768 (OP(first) == MINMOD) ||
653099ff 1769 /* An {n,m} with n>0 */
22c35a8c 1770 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1771 if (OP(first) == PLUS)
1772 sawplus = 1;
1773 else
1774 first += regarglen[(U8)OP(first)];
1775 first = NEXTOPER(first);
a687059c
LW
1776 }
1777
a0d0e21e
LW
1778 /* Starting-point info. */
1779 again:
653099ff 1780 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1781 if (OP(first) == EXACT)
1782 ; /* Empty, get anchored substr later. */
1783 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1784 r->regstclass = first;
1785 }
653099ff 1786 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1787 r->regstclass = first;
22c35a8c
GS
1788 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1789 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1790 r->regstclass = first;
22c35a8c 1791 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1792 r->reganch |= (OP(first) == MBOL
1793 ? ROPT_ANCH_MBOL
1794 : (OP(first) == SBOL
1795 ? ROPT_ANCH_SBOL
1796 : ROPT_ANCH_BOL));
a0d0e21e 1797 first = NEXTOPER(first);
774d564b
PP
1798 goto again;
1799 }
1800 else if (OP(first) == GPOS) {
1801 r->reganch |= ROPT_ANCH_GPOS;
1802 first = NEXTOPER(first);
1803 goto again;
a0d0e21e
LW
1804 }
1805 else if ((OP(first) == STAR &&
22c35a8c 1806 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1807 !(r->reganch & ROPT_ANCH) )
1808 {
1809 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1810 int type = OP(NEXTOPER(first));
1811
ffc61ed2 1812 if (type == REG_ANY)
cad2e5aa
JH
1813 type = ROPT_ANCH_MBOL;
1814 else
1815 type = ROPT_ANCH_SBOL;
1816
1817 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1818 first = NEXTOPER(first);
774d564b 1819 goto again;
a0d0e21e 1820 }
b81d288d 1821 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1822 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1823 /* x+ must match at the 1st pos of run of x's */
1824 r->reganch |= ROPT_SKIP;
a0d0e21e 1825
c277df42 1826 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1827 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1828 (IV)(first - scan + 1)));
a0d0e21e
LW
1829 /*
1830 * If there's something expensive in the r.e., find the
1831 * longest literal string that must appear and make it the
1832 * regmust. Resolve ties in favor of later strings, since
1833 * the regstart check works with the beginning of the r.e.
1834 * and avoiding duplication strengthens checking. Not a
1835 * strong reason, but sufficient in the absence of others.
1836 * [Now we resolve ties in favor of the earlier string if
c277df42 1837 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1838 * earlier string may buy us something the later one won't.]
1839 */
a0d0e21e 1840 minlen = 0;
a687059c 1841
79cb57f6
GS
1842 data.longest_fixed = newSVpvn("",0);
1843 data.longest_float = newSVpvn("",0);
1844 data.last_found = newSVpvn("",0);
c277df42
IZ
1845 data.longest = &(data.longest_fixed);
1846 first = scan;
653099ff 1847 if (!r->regstclass) {
830247a4 1848 cl_init(pRExC_state, &ch_class);
653099ff
GS
1849 data.start_class = &ch_class;
1850 stclass_flag = SCF_DO_STCLASS_AND;
1851 } else /* XXXX Check for BOUND? */
1852 stclass_flag = 0;
cb434fcc 1853 data.last_closep = &last_close;
653099ff 1854
830247a4 1855 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1856 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1857 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1858 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1859 && !RExC_seen_zerolen
1860 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1861 r->reganch |= ROPT_CHECK_ALL;
830247a4 1862 scan_commit(pRExC_state, &data);
c277df42
IZ
1863 SvREFCNT_dec(data.last_found);
1864
a0ed51b3 1865 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1866 if (longest_float_length
c277df42
IZ
1867 || (data.flags & SF_FL_BEFORE_EOL
1868 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1869 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1870 int t;
1871
a0ed51b3 1872 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1873 && data.offset_fixed == data.offset_float_min
1874 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1875 goto remove_float; /* As in (a)+. */
1876
c277df42
IZ
1877 r->float_substr = data.longest_float;
1878 r->float_min_offset = data.offset_float_min;
1879 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1880 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1881 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1882 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1883 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1884 }
1885 else {
aca2d497 1886 remove_float:
c277df42
IZ
1887 r->float_substr = Nullsv;
1888 SvREFCNT_dec(data.longest_float);
c5254dd6 1889 longest_float_length = 0;
a0d0e21e 1890 }
c277df42 1891
a0ed51b3 1892 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1893 if (longest_fixed_length
c277df42
IZ
1894 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1895 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1896 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1897 int t;
1898
c277df42
IZ
1899 r->anchored_substr = data.longest_fixed;
1900 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1901 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1902 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1903 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1904 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1905 }
1906 else {
c277df42
IZ
1907 r->anchored_substr = Nullsv;
1908 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1909 longest_fixed_length = 0;
a0d0e21e 1910 }
b81d288d 1911 if (r->regstclass
ffc61ed2 1912 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
1913 r->regstclass = NULL;
1914 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1915 && !(data.start_class->flags & ANYOF_EOS)
1916 && !cl_is_anything(data.start_class)) {
830247a4 1917 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1918
b81d288d 1919 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1920 struct regnode_charclass_class);
1921 StructCopy(data.start_class,
830247a4 1922 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1923 struct regnode_charclass_class);
830247a4 1924 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1925 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 1926 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
1927 DEBUG_r({ SV *sv = sv_newmortal();
1928 regprop(sv, (regnode*)data.start_class);
1929 PerlIO_printf(Perl_debug_log,
1930 "synthetic stclass `%s'.\n",
1931 SvPVX(sv));});
653099ff 1932 }
c277df42
IZ
1933
1934 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 1935 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
1936 r->check_substr = r->anchored_substr;
1937 r->check_offset_min = r->check_offset_max = r->anchored_offset;
1938 if (r->reganch & ROPT_ANCH_SINGLE)
1939 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
1940 }
1941 else {
c277df42
IZ
1942 r->check_substr = r->float_substr;
1943 r->check_offset_min = data.offset_float_min;
1944 r->check_offset_max = data.offset_float_max;
a0d0e21e 1945 }
30382c73
IZ
1946 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1947 This should be changed ASAP! */
1948 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
1949 r->reganch |= RE_USE_INTUIT;
1950 if (SvTAIL(r->check_substr))
1951 r->reganch |= RE_INTUIT_TAIL;
1952 }
a0ed51b3
LW
1953 }
1954 else {
c277df42
IZ
1955 /* Several toplevels. Best we can is to set minlen. */
1956 I32 fake;
653099ff 1957 struct regnode_charclass_class ch_class;
cb434fcc 1958 I32 last_close = 0;
c277df42
IZ
1959
1960 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1961 scan = r->program + 1;
830247a4 1962 cl_init(pRExC_state, &ch_class);
653099ff 1963 data.start_class = &ch_class;
cb434fcc 1964 data.last_closep = &last_close;
e1901655 1965 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 1966 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
1967 if (!(data.start_class->flags & ANYOF_EOS)
1968 && !cl_is_anything(data.start_class)) {
830247a4 1969 I32 n = add_data(pRExC_state, 1, "f");
653099ff 1970
b81d288d 1971 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
1972 struct regnode_charclass_class);
1973 StructCopy(data.start_class,
830247a4 1974 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 1975 struct regnode_charclass_class);
830247a4 1976 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 1977 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
1978 DEBUG_r({ SV* sv = sv_newmortal();
1979 regprop(sv, (regnode*)data.start_class);
1980 PerlIO_printf(Perl_debug_log,
1981 "synthetic stclass `%s'.\n",
1982 SvPVX(sv));});
653099ff 1983 }
a0d0e21e
LW
1984 }
1985
a0d0e21e 1986 r->minlen = minlen;
b81d288d 1987 if (RExC_seen & REG_SEEN_GPOS)
c277df42 1988 r->reganch |= ROPT_GPOS_SEEN;
830247a4 1989 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 1990 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 1991 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 1992 r->reganch |= ROPT_EVAL_SEEN;
3baa4c62
JH
1993 if (RExC_seen & REG_SEEN_SANY)
1994 r->reganch |= ROPT_SANY_SEEN;
830247a4
IZ
1995 Newz(1002, r->startp, RExC_npar, I32);
1996 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 1997 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
1998 DEBUG_r(regdump(r));
1999 return(r);
a687059c
LW
2000}
2001
2002/*
2003 - reg - regular expression, i.e. main body or parenthesized thing
2004 *
2005 * Caller must absorb opening parenthesis.
2006 *
2007 * Combining parenthesis handling with the base level of regular expression
2008 * is a trifle forced, but the need to tie the tails of the branches to what
2009 * follows makes it hard to avoid.
2010 */
76e3520e 2011STATIC regnode *
830247a4 2012S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2013 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2014{
c277df42
IZ
2015 register regnode *ret; /* Will be the head of the group. */
2016 register regnode *br;
2017 register regnode *lastbr;
2018 register regnode *ender = 0;
a0d0e21e 2019 register I32 parno = 0;
830247a4 2020 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
fac92740 2021 char * parse_start = RExC_parse; /* MJD */
830247a4 2022 char *oregcomp_parse = RExC_parse;
c277df42 2023 char c;
a0d0e21e 2024
821b33a5 2025 *flagp = 0; /* Tentatively. */
a0d0e21e
LW
2026
2027 /* Make an OPEN node, if parenthesized. */
2028 if (paren) {
fac92740 2029 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2030 U16 posflags = 0, negflags = 0;
2031 U16 *flagsp = &posflags;
0f5d15d6 2032 int logical = 0;
830247a4 2033 char *seqstart = RExC_parse;
ca9dfc88 2034
830247a4
IZ
2035 RExC_parse++;
2036 paren = *RExC_parse++;
c277df42 2037 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2038 switch (paren) {
fac92740 2039 case '<': /* (?<...) */
830247a4 2040 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2041 if (*RExC_parse == '!')
c277df42 2042 paren = ',';
b81d288d 2043 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2044 goto unknown;
830247a4 2045 RExC_parse++;
fac92740
MJD
2046 case '=': /* (?=...) */
2047 case '!': /* (?!...) */
830247a4 2048 RExC_seen_zerolen++;
fac92740
MJD
2049 case ':': /* (?:...) */
2050 case '>': /* (?>...) */
a0d0e21e 2051 break;
fac92740
MJD
2052 case '$': /* (?$...) */
2053 case '@': /* (?@...) */
8615cb43 2054 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2055 break;
fac92740 2056 case '#': /* (?#...) */
830247a4
IZ
2057 while (*RExC_parse && *RExC_parse != ')')
2058 RExC_parse++;
2059 if (*RExC_parse != ')')
c277df42 2060 FAIL("Sequence (?#... not terminated");
830247a4 2061 nextchar(pRExC_state);
a0d0e21e
LW
2062 *flagp = TRYAGAIN;
2063 return NULL;
fac92740 2064 case 'p': /* (?p...) */
d7e9e385 2065 if (SIZE_ONLY)
830247a4 2066 vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2067 /* FALL THROUGH*/
fac92740 2068 case '?': /* (??...) */
0f5d15d6 2069 logical = 1;
830247a4 2070 paren = *RExC_parse++;
0f5d15d6 2071 /* FALL THROUGH */
fac92740 2072 case '{': /* (?{...}) */
c277df42 2073 {
c277df42
IZ
2074 I32 count = 1, n = 0;
2075 char c;
830247a4 2076 char *s = RExC_parse;
c277df42
IZ
2077 SV *sv;
2078 OP_4tree *sop, *rop;
2079
830247a4
IZ
2080 RExC_seen_zerolen++;
2081 RExC_seen |= REG_SEEN_EVAL;
2082 while (count && (c = *RExC_parse)) {
2083 if (c == '\\' && RExC_parse[1])
2084 RExC_parse++;
b81d288d 2085 else if (c == '{')
c277df42 2086 count++;
b81d288d 2087 else if (c == '}')
c277df42 2088 count--;
830247a4 2089 RExC_parse++;
c277df42 2090 }
830247a4 2091 if (*RExC_parse != ')')
b45f050a 2092 {
b81d288d 2093 RExC_parse = s;
b45f050a
JF
2094 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2095 }
c277df42
IZ
2096 if (!SIZE_ONLY) {
2097 AV *av;
b81d288d
AB
2098
2099 if (RExC_parse - 1 - s)
830247a4 2100 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2101 else
79cb57f6 2102 sv = newSVpvn("", 0);
c277df42 2103
569233ed
SB
2104 ENTER;
2105 Perl_save_re_context(aTHX);
c277df42 2106 rop = sv_compile_2op(sv, &sop, "re", &av);
569233ed 2107 LEAVE;
c277df42 2108
830247a4
IZ
2109 n = add_data(pRExC_state, 3, "nop");
2110 RExC_rx->data->data[n] = (void*)rop;
2111 RExC_rx->data->data[n+1] = (void*)sop;
2112 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2113 SvREFCNT_dec(sv);
a0ed51b3 2114 }
e24b16f9 2115 else { /* First pass */
830247a4 2116 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2117 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2118 /* No compiled RE interpolated, has runtime
2119 components ===> unsafe. */
2120 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2121 if (PL_tainting && PL_tainted)
cc6b7395 2122 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2123 }
2124
830247a4 2125 nextchar(pRExC_state);
0f5d15d6 2126 if (logical) {
830247a4 2127 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2128 if (!SIZE_ONLY)
2129 ret->flags = 2;
830247a4 2130 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2131 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2132 return ret;
2133 }
830247a4 2134 return reganode(pRExC_state, EVAL, n);
c277df42 2135 }
fac92740 2136 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2137 {
fac92740 2138 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2139 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2140 || RExC_parse[1] == '<'
830247a4 2141 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2142 I32 flag;
2143
830247a4 2144 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2145 if (!SIZE_ONLY)
2146 ret->flags = 1;
830247a4 2147 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2148 goto insert_if;
b81d288d 2149 }
a0ed51b3 2150 }
830247a4 2151 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2152 /* (?(1)...) */
830247a4 2153 parno = atoi(RExC_parse++);
c277df42 2154
830247a4
IZ
2155 while (isDIGIT(*RExC_parse))
2156 RExC_parse++;
fac92740
MJD
2157 ret = reganode(pRExC_state, GROUPP, parno);
2158
830247a4 2159 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2160 vFAIL("Switch condition not recognized");
c277df42 2161 insert_if:
830247a4
IZ
2162 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2163 br = regbranch(pRExC_state, &flags, 1);
c277df42 2164 if (br == NULL)
830247a4 2165 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2166 else
830247a4
IZ
2167 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2168 c = *nextchar(pRExC_state);
d1b80229
IZ
2169 if (flags&HASWIDTH)
2170 *flagp |= HASWIDTH;
c277df42 2171 if (c == '|') {
830247a4
IZ
2172 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2173 regbranch(pRExC_state, &flags, 1);
2174 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2175 if (flags&HASWIDTH)
2176 *flagp |= HASWIDTH;
830247a4 2177 c = *nextchar(pRExC_state);
a0ed51b3
LW
2178 }
2179 else
c277df42
IZ
2180 lastbr = NULL;
2181 if (c != ')')
8615cb43 2182 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2183 ender = reg_node(pRExC_state, TAIL);
2184 regtail(pRExC_state, br, ender);
c277df42 2185 if (lastbr) {
830247a4
IZ
2186 regtail(pRExC_state, lastbr, ender);
2187 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2188 }
2189 else
830247a4 2190 regtail(pRExC_state, ret, ender);
c277df42 2191 return ret;
a0ed51b3
LW
2192 }
2193 else {
830247a4 2194 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2195 }
2196 }
1b1626e4 2197 case 0:
830247a4 2198 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2199 vFAIL("Sequence (? incomplete");
1b1626e4 2200 break;
a0d0e21e 2201 default:
830247a4 2202 --RExC_parse;
fac92740 2203 parse_flags: /* (?i) */
830247a4
IZ
2204 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2205 if (*RExC_parse != 'o')
2206 pmflag(flagsp, *RExC_parse);
2207 ++RExC_parse;
ca9dfc88 2208 }
830247a4 2209 if (*RExC_parse == '-') {
ca9dfc88 2210 flagsp = &negflags;
830247a4 2211 ++RExC_parse;
ca9dfc88 2212 goto parse_flags;
48c036b1 2213 }
830247a4
IZ
2214 RExC_flags16 |= posflags;
2215 RExC_flags16 &= ~negflags;
2216 if (*RExC_parse == ':') {
2217 RExC_parse++;
ca9dfc88
IZ
2218 paren = ':';
2219 break;
2220 }
c277df42 2221 unknown:
830247a4
IZ
2222 if (*RExC_parse != ')') {
2223 RExC_parse++;
2224 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2225 }
830247a4 2226 nextchar(pRExC_state);
a0d0e21e
LW
2227 *flagp = TRYAGAIN;
2228 return NULL;
2229 }
2230 }
fac92740 2231 else { /* (...) */
830247a4
IZ
2232 parno = RExC_npar;
2233 RExC_npar++;
2234 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2235 Set_Node_Length(ret, 1); /* MJD */
2236 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2237 open = 1;
a0d0e21e 2238 }
a0ed51b3 2239 }
fac92740 2240 else /* ! paren */
a0d0e21e
LW
2241 ret = NULL;
2242
2243 /* Pick up the branches, linking them together. */
fac92740 2244 parse_start = RExC_parse; /* MJD */
830247a4 2245 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2246 /* branch_len = (paren != 0); */
2247
a0d0e21e
LW
2248 if (br == NULL)
2249 return(NULL);
830247a4
IZ
2250 if (*RExC_parse == '|') {
2251 if (!SIZE_ONLY && RExC_extralen) {
2252 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2253 }
fac92740 2254 else { /* MJD */
830247a4 2255 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2256 Set_Node_Length(br, paren != 0);
2257 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2258 }
c277df42
IZ
2259 have_branch = 1;
2260 if (SIZE_ONLY)
830247a4 2261 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2262 }
2263 else if (paren == ':') {
c277df42
IZ
2264 *flagp |= flags&SIMPLE;
2265 }
2266 if (open) { /* Starts with OPEN. */
830247a4 2267 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2268 }
2269 else if (paren != '?') /* Not Conditional */
a0d0e21e 2270 ret = br;
821b33a5
IZ
2271 if (flags&HASWIDTH)
2272 *flagp |= HASWIDTH;
a0d0e21e 2273 *flagp |= flags&SPSTART;
c277df42 2274 lastbr = br;
830247a4
IZ
2275 while (*RExC_parse == '|') {
2276 if (!SIZE_ONLY && RExC_extralen) {
2277 ender = reganode(pRExC_state, LONGJMP,0);
2278 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2279 }
2280 if (SIZE_ONLY)
830247a4
IZ
2281 RExC_extralen += 2; /* Account for LONGJMP. */
2282 nextchar(pRExC_state);
2283 br = regbranch(pRExC_state, &flags, 0);
fac92740 2284
a687059c 2285 if (br == NULL)
a0d0e21e 2286 return(NULL);
830247a4 2287 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2288 lastbr = br;
821b33a5
IZ
2289 if (flags&HASWIDTH)
2290 *flagp |= HASWIDTH;
a687059c 2291 *flagp |= flags&SPSTART;
a0d0e21e
LW
2292 }
2293
c277df42
IZ
2294 if (have_branch || paren != ':') {
2295 /* Make a closing node, and hook it on the end. */
2296 switch (paren) {
2297 case ':':
830247a4 2298 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2299 break;
2300 case 1:
830247a4 2301 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2302 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2303 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2304 break;
2305 case '<':
c277df42
IZ
2306 case ',':
2307 case '=':
2308 case '!':
c277df42 2309 *flagp &= ~HASWIDTH;
821b33a5
IZ
2310 /* FALL THROUGH */
2311 case '>':
830247a4 2312 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2313 break;
2314 case 0:
830247a4 2315 ender = reg_node(pRExC_state, END);
c277df42
IZ
2316 break;
2317 }
830247a4 2318 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2319
c277df42
IZ
2320 if (have_branch) {
2321 /* Hook the tails of the branches to the closing node. */
2322 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2323 regoptail(pRExC_state, br, ender);
c277df42
IZ
2324 }
2325 }
a0d0e21e 2326 }
c277df42
IZ
2327
2328 {
2329 char *p;
2330 static char parens[] = "=!<,>";
2331
2332 if (paren && (p = strchr(parens, paren))) {
2333 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2334 int flag = (p - parens) > 1;
2335
2336 if (paren == '>')
2337 node = SUSPEND, flag = 0;
830247a4 2338 reginsert(pRExC_state, node,ret);
c277df42 2339 ret->flags = flag;
830247a4 2340 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2341 }
a0d0e21e
LW
2342 }
2343
2344 /* Check for proper termination. */
ce3e6498 2345 if (paren) {
830247a4
IZ
2346 RExC_flags16 = oregflags;
2347 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2348 RExC_parse = oregcomp_parse;
380a0633 2349 vFAIL("Unmatched (");
ce3e6498 2350 }
a0ed51b3 2351 }
830247a4
IZ
2352 else if (!paren && RExC_parse < RExC_end) {
2353 if (*RExC_parse == ')') {
2354 RExC_parse++;
380a0633 2355 vFAIL("Unmatched )");
a0ed51b3
LW
2356 }
2357 else
b45f050a 2358 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2359 /* NOTREACHED */
2360 }
a687059c 2361
a0d0e21e 2362 return(ret);
a687059c
LW
2363}
2364
2365/*
2366 - regbranch - one alternative of an | operator
2367 *
2368 * Implements the concatenation operator.
2369 */
76e3520e 2370STATIC regnode *
830247a4 2371S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2372{
c277df42
IZ
2373 register regnode *ret;
2374 register regnode *chain = NULL;
2375 register regnode *latest;
2376 I32 flags = 0, c = 0;
a0d0e21e 2377
b81d288d 2378 if (first)
c277df42
IZ
2379 ret = NULL;
2380 else {
b81d288d 2381 if (!SIZE_ONLY && RExC_extralen)
830247a4 2382 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2383 else {
830247a4 2384 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2385 Set_Node_Length(ret, 1);
2386 }
c277df42
IZ
2387 }
2388
b81d288d 2389 if (!first && SIZE_ONLY)
830247a4 2390 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2391
c277df42 2392 *flagp = WORST; /* Tentatively. */
a0d0e21e 2393
830247a4
IZ
2394 RExC_parse--;
2395 nextchar(pRExC_state);
2396 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2397 flags &= ~TRYAGAIN;
830247a4 2398 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2399 if (latest == NULL) {
2400 if (flags & TRYAGAIN)
2401 continue;
2402 return(NULL);
a0ed51b3
LW
2403 }
2404 else if (ret == NULL)
c277df42 2405 ret = latest;
a0d0e21e 2406 *flagp |= flags&HASWIDTH;
c277df42 2407 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2408 *flagp |= flags&SPSTART;
2409 else {
830247a4
IZ
2410 RExC_naughty++;
2411 regtail(pRExC_state, chain, latest);
a687059c 2412 }
a0d0e21e 2413 chain = latest;
c277df42
IZ
2414 c++;
2415 }
2416 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2417 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2418 if (ret == NULL)
2419 ret = chain;
2420 }
2421 if (c == 1) {
2422 *flagp |= flags&SIMPLE;
a0d0e21e 2423 }
a687059c 2424
a0d0e21e 2425 return(ret);
a687059c
LW
2426}
2427
2428/*
2429 - regpiece - something followed by possible [*+?]
2430 *
2431 * Note that the branching code sequences used for ? and the general cases
2432 * of * and + are somewhat optimized: they use the same NOTHING node as
2433 * both the endmarker for their branch list and the body of the last branch.
2434 * It might seem that this node could be dispensed with entirely, but the
2435 * endmarker role is not redundant.
2436 */
76e3520e 2437STATIC regnode *
830247a4 2438S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2439{
c277df42 2440 register regnode *ret;
a0d0e21e
LW
2441 register char op;
2442 register char *next;
2443 I32 flags;
830247a4 2444 char *origparse = RExC_parse;
a0d0e21e
LW
2445 char *maxpos;
2446 I32 min;
c277df42 2447 I32 max = REG_INFTY;
fac92740 2448 char *parse_start;
a0d0e21e 2449
830247a4 2450 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2451 if (ret == NULL) {
2452 if (flags & TRYAGAIN)
2453 *flagp |= TRYAGAIN;
2454 return(NULL);
2455 }
2456
830247a4 2457 op = *RExC_parse;
a0d0e21e 2458
830247a4 2459 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2460 parse_start = RExC_parse; /* MJD */
830247a4 2461 next = RExC_parse + 1;
a0d0e21e
LW
2462 maxpos = Nullch;
2463 while (isDIGIT(*next) || *next == ',') {
2464 if (*next == ',') {
2465 if (maxpos)
2466 break;
2467 else
2468 maxpos = next;
a687059c 2469 }
a0d0e21e
LW
2470 next++;
2471 }
2472 if (*next == '}') { /* got one */
2473 if (!maxpos)
2474 maxpos = next;
830247a4
IZ
2475 RExC_parse++;
2476 min = atoi(RExC_parse);
a0d0e21e
LW
2477 if (*maxpos == ',')
2478 maxpos++;
2479 else
830247a4 2480 maxpos = RExC_parse;
a0d0e21e
LW
2481 max = atoi(maxpos);
2482 if (!max && *maxpos != '0')
c277df42
IZ
2483 max = REG_INFTY; /* meaning "infinity" */
2484 else if (max >= REG_INFTY)
8615cb43 2485 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2486 RExC_parse = next;
2487 nextchar(pRExC_state);
a0d0e21e
LW
2488
2489 do_curly:
2490 if ((flags&SIMPLE)) {
830247a4
IZ
2491 RExC_naughty += 2 + RExC_naughty / 2;
2492 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2493 Set_Node_Offset(ret, parse_start+1); /* MJD */
2494 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2495 }
2496 else {
830247a4 2497 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2498
2499 w->flags = 0;
830247a4
IZ
2500 regtail(pRExC_state, ret, w);
2501 if (!SIZE_ONLY && RExC_extralen) {
2502 reginsert(pRExC_state, LONGJMP,ret);
2503 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2504 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2505 }
830247a4 2506 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2507 /* MJD hk */
2508 Set_Node_Offset(ret, parse_start+1);
2509 Set_Node_Length(ret,
2510 op == '{' ? (RExC_parse - parse_start) : 1);
2511
830247a4 2512 if (!SIZE_ONLY && RExC_extralen)
c277df42 2513 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2514 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2515 if (SIZE_ONLY)
830247a4
IZ
2516 RExC_whilem_seen++, RExC_extralen += 3;
2517 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2518 }
c277df42 2519 ret->flags = 0;
a0d0e21e
LW
2520
2521 if (min > 0)
821b33a5
IZ
2522 *flagp = WORST;
2523 if (max > 0)
2524 *flagp |= HASWIDTH;
a0d0e21e 2525 if (max && max < min)
8615cb43 2526 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2527 if (!SIZE_ONLY) {
2528 ARG1_SET(ret, min);
2529 ARG2_SET(ret, max);
a687059c 2530 }
a687059c 2531
a0d0e21e 2532 goto nest_check;
a687059c 2533 }
a0d0e21e 2534 }
a687059c 2535
a0d0e21e
LW
2536 if (!ISMULT1(op)) {
2537 *flagp = flags;
a687059c 2538 return(ret);
a0d0e21e 2539 }
bb20fd44 2540
c277df42 2541#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2542
2543 /* if this is reinstated, don't forget to put this back into perldiag:
2544
2545 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2546
2547 (F) The part of the regexp subject to either the * or + quantifier
2548 could match an empty string. The {#} shows in the regular
2549 expression about where the problem was discovered.
2550
2551 */
2552
bb20fd44 2553 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2554 vFAIL("Regexp *+ operand could be empty");
b81d288d 2555#endif
bb20fd44 2556
fac92740 2557 parse_start = RExC_parse;
830247a4 2558 nextchar(pRExC_state);
a0d0e21e 2559
821b33a5 2560 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2561
2562 if (op == '*' && (flags&SIMPLE)) {
830247a4 2563 reginsert(pRExC_state, STAR, ret);
c277df42 2564 ret->flags = 0;
830247a4 2565 RExC_naughty += 4;
a0d0e21e
LW
2566 }
2567 else if (op == '*') {
2568 min = 0;
2569 goto do_curly;
a0ed51b3
LW
2570 }
2571 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2572 reginsert(pRExC_state, PLUS, ret);
c277df42 2573 ret->flags = 0;
830247a4 2574 RExC_naughty += 3;
a0d0e21e
LW
2575 }
2576 else if (op == '+') {
2577 min = 1;
2578 goto do_curly;
a0ed51b3
LW
2579 }
2580 else if (op == '?') {
a0d0e21e
LW
2581 min = 0; max = 1;
2582 goto do_curly;
2583 }
2584 nest_check:
e476b1b5 2585 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2586 vWARN3(RExC_parse,
b45f050a 2587 "%.*s matches null string many times",
830247a4 2588 RExC_parse - origparse,
b45f050a 2589 origparse);
a0d0e21e
LW
2590 }
2591
830247a4
IZ
2592 if (*RExC_parse == '?') {
2593 nextchar(pRExC_state);
2594 reginsert(pRExC_state, MINMOD, ret);
2595 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2596 }
830247a4
IZ
2597 if (ISMULT2(RExC_parse)) {
2598 RExC_parse++;
b45f050a
JF
2599 vFAIL("Nested quantifiers");
2600 }
a0d0e21e
LW
2601
2602 return(ret);
a687059c
LW
2603}
2604
2605/*
2606 - regatom - the lowest level
2607 *
2608 * Optimization: gobbles an entire sequence of ordinary characters so that
2609 * it can turn them into a single node, which is smaller to store and
2610 * faster to run. Backslashed characters are exceptions, each becoming a
2611 * separate node; the code is simpler that way and it's not worth fixing.
2612 *
b45f050a 2613 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2614STATIC regnode *
830247a4 2615S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2616{
c277df42 2617 register regnode *ret = 0;
a0d0e21e
LW
2618 I32 flags;
2619
2620 *flagp = WORST; /* Tentatively. */
2621
2622tryagain:
830247a4 2623 switch (*RExC_parse) {
a0d0e21e 2624 case '^':
830247a4
IZ
2625 RExC_seen_zerolen++;
2626 nextchar(pRExC_state);
2627 if (RExC_flags16 & PMf_MULTILINE)
2628 ret = reg_node(pRExC_state, MBOL);
2629 else if (RExC_flags16 & PMf_SINGLELINE)
2630 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2631 else
830247a4 2632 ret = reg_node(pRExC_state, BOL);
fac92740 2633 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2634 break;
2635 case '$':
830247a4 2636 nextchar(pRExC_state);
b81d288d 2637 if (*RExC_parse)
830247a4
IZ
2638 RExC_seen_zerolen++;
2639 if (RExC_flags16 & PMf_MULTILINE)
2640 ret = reg_node(pRExC_state, MEOL);
2641 else if (RExC_flags16 & PMf_SINGLELINE)
2642 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2643 else
830247a4 2644 ret = reg_node(pRExC_state, EOL);
fac92740 2645 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2646 break;
2647 case '.':
830247a4 2648 nextchar(pRExC_state);
ffc61ed2
JH
2649 if (RExC_flags16 & PMf_SINGLELINE)
2650 ret = reg_node(pRExC_state, SANY);
2651 else
2652 ret = reg_node(pRExC_state, REG_ANY);
2653 *flagp |= HASWIDTH|SIMPLE;
830247a4 2654 RExC_naughty++;
fac92740 2655 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2656 break;
2657 case '[':
b45f050a 2658 {
830247a4 2659 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2660 ret = regclass(pRExC_state);
830247a4
IZ
2661 if (*RExC_parse != ']') {
2662 RExC_parse = oregcomp_parse;
b45f050a
JF
2663 vFAIL("Unmatched [");
2664 }
830247a4 2665 nextchar(pRExC_state);
a0d0e21e 2666 *flagp |= HASWIDTH|SIMPLE;
fac92740 2667 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2668 break;
b45f050a 2669 }
a0d0e21e 2670 case '(':
830247a4
IZ
2671 nextchar(pRExC_state);
2672 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2673 if (ret == NULL) {
bf93d4cc 2674 if (flags & TRYAGAIN) {
830247a4 2675 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2676 /* Make parent create an empty node if needed. */
2677 *flagp |= TRYAGAIN;
2678 return(NULL);
2679 }
a0d0e21e 2680 goto tryagain;
bf93d4cc 2681 }
a0d0e21e
LW
2682 return(NULL);
2683 }
c277df42 2684 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2685 break;
2686 case '|':
2687 case ')':
2688 if (flags & TRYAGAIN) {
2689 *flagp |= TRYAGAIN;
2690 return NULL;
2691 }
b45f050a 2692 vFAIL("Internal urp");
a0d0e21e
LW
2693 /* Supposed to be caught earlier. */
2694 break;
85afd4ae 2695 case '{':
830247a4
IZ
2696 if (!regcurly(RExC_parse)) {
2697 RExC_parse++;
85afd4ae
CS
2698 goto defchar;
2699 }
2700 /* FALL THROUGH */
a0d0e21e
LW
2701 case '?':
2702 case '+':
2703 case '*':
830247a4 2704 RExC_parse++;
b45f050a 2705 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2706 break;
2707 case '\\':
830247a4 2708 switch (*++RExC_parse) {
a0d0e21e 2709 case 'A':
830247a4
IZ
2710 RExC_seen_zerolen++;
2711 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2712 *flagp |= SIMPLE;
830247a4 2713 nextchar(pRExC_state);
fac92740 2714 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2715 break;
2716 case 'G':
830247a4
IZ
2717 ret = reg_node(pRExC_state, GPOS);
2718 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2719 *flagp |= SIMPLE;
830247a4 2720 nextchar(pRExC_state);
fac92740 2721 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2722 break;
2723 case 'Z':
830247a4 2724 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2725 *flagp |= SIMPLE;
830247a4 2726 nextchar(pRExC_state);
a0d0e21e 2727 break;
b85d18e9 2728 case 'z':
830247a4 2729 ret = reg_node(pRExC_state, EOS);
b85d18e9 2730 *flagp |= SIMPLE;
830247a4
IZ
2731 RExC_seen_zerolen++; /* Do not optimize RE away */
2732 nextchar(pRExC_state);
fac92740 2733 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2734 break;
4a2d328f 2735 case 'C':
830247a4 2736 ret = reg_node(pRExC_state, SANY);
3baa4c62 2737 RExC_seen |= REG_SEEN_SANY;
a0ed51b3 2738 *flagp |= HASWIDTH|SIMPLE;
830247a4 2739 nextchar(pRExC_state);
fac92740 2740 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2741 break;
2742 case 'X':
830247a4 2743 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2744 *flagp |= HASWIDTH;
830247a4 2745 nextchar(pRExC_state);
fac92740 2746 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2747 break;
a0d0e21e 2748 case 'w':
ffc61ed2 2749 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2750 *flagp |= HASWIDTH|SIMPLE;
830247a4 2751 nextchar(pRExC_state);
fac92740 2752 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2753 break;
2754 case 'W':
ffc61ed2 2755 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2756 *flagp |= HASWIDTH|SIMPLE;
830247a4 2757 nextchar(pRExC_state);
fac92740 2758 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2759 break;
2760 case 'b':
830247a4
IZ
2761 RExC_seen_zerolen++;
2762 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2763 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2764 *flagp |= SIMPLE;
830247a4 2765 nextchar(pRExC_state);
fac92740 2766 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2767 break;
2768 case 'B':
830247a4
IZ
2769 RExC_seen_zerolen++;
2770 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2771 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2772 *flagp |= SIMPLE;
830247a4 2773 nextchar(pRExC_state);
fac92740 2774 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2775 break;
2776 case 's':
ffc61ed2 2777 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2778 *flagp |= HASWIDTH|SIMPLE;
830247a4 2779 nextchar(pRExC_state);
fac92740 2780 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2781 break;
2782 case 'S':
ffc61ed2 2783 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2784 *flagp |= HASWIDTH|SIMPLE;
830247a4 2785 nextchar(pRExC_state);
fac92740 2786 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2787 break;
2788 case 'd':
ffc61ed2 2789 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2790 *flagp |= HASWIDTH|SIMPLE;
830247a4 2791 nextchar(pRExC_state);
fac92740 2792 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2793 break;
2794 case 'D':
ffc61ed2 2795 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2796 *flagp |= HASWIDTH|SIMPLE;
830247a4 2797 nextchar(pRExC_state);
fac92740 2798 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2799 break;
a14b48bc
LW
2800 case 'p':
2801 case 'P':
3568d838 2802 {
830247a4 2803 char* oldregxend = RExC_end;
fac92740 2804 char* parse_start = RExC_parse;
a14b48bc 2805
830247a4 2806 if (RExC_parse[1] == '{') {
3568d838 2807 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2808 RExC_end = strchr(RExC_parse, '}');
2809 if (!RExC_end) {
2810 RExC_parse += 2;
2811 RExC_end = oldregxend;
b45f050a
JF
2812 vFAIL("Missing right brace on \\p{}");
2813 }
830247a4 2814 RExC_end++;
a14b48bc
LW
2815 }
2816 else
830247a4
IZ
2817 RExC_end = RExC_parse + 2;
2818 RExC_parse--;
a14b48bc 2819
ffc61ed2 2820 ret = regclass(pRExC_state);
a14b48bc 2821
830247a4
IZ
2822 RExC_end = oldregxend;
2823 RExC_parse--;
fac92740 2824 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2825 nextchar(pRExC_state);
a14b48bc
LW
2826 *flagp |= HASWIDTH|SIMPLE;
2827 }
2828 break;
a0d0e21e
LW
2829 case 'n':
2830 case 'r':
2831 case 't':
2832 case 'f':
2833 case 'e':
2834 case 'a':
2835 case 'x':
2836 case 'c':
2837 case '0':
2838 goto defchar;
2839 case '1': case '2': case '3': case '4':
2840 case '5': case '6': case '7': case '8': case '9':
2841 {
830247a4 2842 I32 num = atoi(RExC_parse);
a0d0e21e 2843
830247a4 2844 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2845 goto defchar;
2846 else {
fac92740 2847 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2848 while (isDIGIT(*RExC_parse))
2849 RExC_parse++;
b45f050a 2850
830247a4 2851 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2852 vFAIL("Reference to nonexistent group");
830247a4
IZ
2853 RExC_sawback = 1;
2854 ret = reganode(pRExC_state, FOLD
a0ed51b3 2855 ? (LOC ? REFFL : REFF)
c8756f30 2856 : REF, num);
a0d0e21e 2857 *flagp |= HASWIDTH;
fac92740
MJD
2858
2859 /* override incorrect value set in reganode MJD */
2860 Set_Node_Offset(ret, parse_start+1);
2861 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
2862 RExC_parse--;
2863 nextchar(pRExC_state);
a0d0e21e
LW
2864 }
2865 }
2866 break;
2867 case '\0':
830247a4 2868 if (RExC_parse >= RExC_end)
b45f050a 2869 FAIL("Trailing \\");
a0d0e21e
LW
2870 /* FALL THROUGH */
2871 default:
c9f97d15
IZ
2872 /* Do not generate `unrecognized' warnings here, we fall
2873 back into the quick-grab loop below */
a0d0e21e
LW
2874 goto defchar;
2875 }
2876 break;
4633a7c4
LW
2877
2878 case '#':
830247a4
IZ
2879 if (RExC_flags16 & PMf_EXTENDED) {
2880 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2881 if (RExC_parse < RExC_end)
4633a7c4
LW
2882 goto tryagain;
2883 }
2884 /* FALL THROUGH */
2885
a0d0e21e 2886 default: {
ba210ebe 2887 register STRLEN len;
a0ed51b3 2888 register UV ender;
a0d0e21e 2889 register char *p;
c277df42 2890 char *oldp, *s;
ba210ebe 2891 STRLEN numlen;
fac92740 2892 char *parse_start = RExC_parse - 1;
a0d0e21e 2893
830247a4 2894 RExC_parse++;
a0d0e21e
LW
2895
2896 defchar:
830247a4 2897 ret = reg_node(pRExC_state, FOLD
a0ed51b3 2898 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 2899 : EXACT);
cd439c50 2900 s = STRING(ret);
830247a4
IZ
2901 for (len = 0, p = RExC_parse - 1;
2902 len < 127 && p < RExC_end;
a0d0e21e
LW
2903 len++)
2904 {
2905 oldp = p;
5b5a24f7 2906
830247a4
IZ
2907 if (RExC_flags16 & PMf_EXTENDED)
2908 p = regwhite(p, RExC_end);
a0d0e21e
LW
2909 switch (*p) {
2910 case '^':
2911 case '$':
2912 case '.':
2913 case '[':
2914 case '(':
2915 case ')':
2916 case '|':
2917 goto loopdone;
2918 case '\\':
2919 switch (*++p) {
2920 case 'A':
2921 case 'G':
2922 case 'Z':
b85d18e9 2923 case 'z':
a0d0e21e
LW
2924 case 'w':
2925 case 'W':
2926 case 'b':
2927 case 'B':
2928 case 's':
2929 case 'S':
2930 case 'd':
2931 case 'D':
a14b48bc
LW
2932 case 'p':
2933 case 'P':
a0d0e21e
LW
2934 --p;
2935 goto loopdone;
2936 case 'n':
2937 ender = '\n';
2938 p++;
a687059c 2939 break;
a0d0e21e
LW
2940 case 'r':
2941 ender = '\r';
2942 p++;
a687059c 2943 break;
a0d0e21e
LW
2944 case 't':
2945 ender = '\t';
2946 p++;
a687059c 2947 break;
a0d0e21e
LW
2948 case 'f':
2949 ender = '\f';
2950 p++;
a687059c 2951 break;
a0d0e21e 2952 case 'e':
c7f1f016 2953 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 2954 p++;
a687059c 2955 break;
a0d0e21e 2956 case 'a':
c7f1f016 2957 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 2958 p++;
a687059c 2959 break;
a0d0e21e 2960 case 'x':
a0ed51b3
LW
2961 if (*++p == '{') {
2962 char* e = strchr(p, '}');
b81d288d 2963
b45f050a 2964 if (!e) {
830247a4 2965 RExC_parse = p + 1;
b45f050a
JF
2966 vFAIL("Missing right brace on \\x{}");
2967 }
de5f0749 2968 else {
b21ed0a9 2969 numlen = 1; /* allow underscores */
de35ba6f 2970 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
aaa80028
JH
2971 if (ender > 0xff)
2972 RExC_utf8 = 1;
b21ed0a9
GS
2973 /* numlen is generous */
2974 if (numlen + len >= 127) {
a0ed51b3
LW
2975 p--;
2976 goto loopdone;
2977 }
2978 p = e + 1;
2979 }
a0ed51b3
LW
2980 }
2981 else {
b21ed0a9 2982 numlen = 0; /* disallow underscores */
dff6d3cd 2983 ender = (UV)scan_hex(p, 2, &numlen);
a0ed51b3
LW
2984 p += numlen;
2985 }
a687059c 2986 break;
a0d0e21e
LW
2987 case 'c':
2988 p++;
bbce6d69
PP
2989 ender = UCHARAT(p++);
2990 ender = toCTRL(ender);
a687059c 2991 break;
a0d0e21e
LW
2992 case '0': case '1': case '2': case '3':case '4':
2993 case '5': case '6': case '7': case '8':case '9':
2994 if (*p == '0' ||
830247a4 2995 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
b21ed0a9 2996 numlen = 0; /* disallow underscores */
dff6d3cd 2997 ender = (UV)scan_oct(p, 3, &numlen);
a0d0e21e
LW
2998 p += numlen;
2999 }
3000 else {
3001 --p;
3002 goto loopdone;
a687059c
LW
3003 }
3004 break;
a0d0e21e 3005 case '\0':
830247a4 3006 if (p >= RExC_end)
b45f050a 3007 FAIL("Trailing \\");
a687059c 3008 /* FALL THROUGH */
a0d0e21e 3009 default:
e476b1b5 3010 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
b45f050a 3011 vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
a0ed51b3 3012 goto normal_default;
a0d0e21e
LW
3013 }
3014 break;
a687059c 3015 default:
a0ed51b3 3016 normal_default:
fd400ab9 3017 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3018 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3019 &numlen, 0);
a0ed51b3
LW
3020 p += numlen;
3021 }
3022 else
3023 ender = *p++;
a0d0e21e 3024 break;
a687059c 3025 }
830247a4
IZ
3026 if (RExC_flags16 & PMf_EXTENDED)
3027 p = regwhite(p, RExC_end);
a0ed51b3
LW
3028 if (UTF && FOLD) {
3029 if (LOC)
5e12f4fb 3030 ender = toLOWER_LC_uvchr(ender);
a0ed51b3
LW
3031 else
3032 ender = toLOWER_uni(ender);
3033 }
a0d0e21e
LW
3034 if (ISMULT2(p)) { /* Back off on ?+*. */
3035 if (len)
3036 p = oldp;
2b9d42f0 3037 else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3038 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3039 s += numlen;
3040 len += numlen;
3041 }
a0d0e21e
LW
3042 else {
3043 len++;
cd439c50 3044 REGC(ender, s++);
a0d0e21e
LW
3045 }
3046 break;
a687059c 3047 }
2b9d42f0 3048 if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
830247a4 3049 reguni(pRExC_state, ender, s, &numlen);
a0ed51b3
LW
3050 s += numlen;
3051 len += numlen - 1;
3052 }
3053 else
cd439c50 3054 REGC(ender, s++);
a0d0e21e
LW
3055 }
3056 loopdone:
830247a4 3057 RExC_parse = p - 1;
fac92740 3058 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3059 nextchar(pRExC_state);
793db0cb
JH
3060 {
3061 /* len is STRLEN which is unsigned, need to copy to signed */
3062 IV iv = len;
3063 if (iv < 0)
3064 vFAIL("Internal disaster");
3065 }
a0d0e21e
LW
3066 if (len > 0)
3067 *flagp |= HASWIDTH;
3068 if (len == 1)
3069 *flagp |= SIMPLE;
c277df42 3070 if (!SIZE_ONLY)
cd439c50
IZ
3071 STR_LEN(ret) = len;
3072 if (SIZE_ONLY)
830247a4 3073 RExC_size += STR_SZ(len);
cd439c50 3074 else
830247a4 3075 RExC_emit += STR_SZ(len);
a687059c 3076 }
a0d0e21e
LW
3077 break;
3078 }
a687059c 3079
a0d0e21e 3080 return(ret);
a687059c
LW
3081}
3082
873ef191 3083STATIC char *
cea2e8a9 3084S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3085{
3086 while (p < e) {
3087 if (isSPACE(*p))
3088 ++p;
3089 else if (*p == '#') {
3090 do {
3091 p++;
3092 } while (p < e && *p != '\n');
3093 }
3094 else
3095 break;
3096 }
3097 return p;
3098}
3099
b8c5462f
JH
3100/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3101 Character classes ([:foo:]) can also be negated ([:^foo:]).
3102 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3103 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3104 but trigger warnings because they are currently unimplemented. */
3105STATIC I32
830247a4 3106S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3107{
3108 char *posixcc = 0;
936ed897 3109 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3110
830247a4 3111 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3112 /* I smell either [: or [= or [. -- POSIX has been here, right? */
830247a4
IZ
3113 (*RExC_parse == ':' ||
3114 *RExC_parse == '=' ||
3115 *RExC_parse == '.')) {
3116 char c = *RExC_parse;
3117 char* s = RExC_parse++;
b81d288d 3118
830247a4
IZ
3119 while (RExC_parse < RExC_end && *RExC_parse != c)
3120 RExC_parse++;
3121 if (RExC_parse == RExC_end)
620e46c5 3122 /* Grandfather lone [:, [=, [. */
830247a4 3123 RExC_parse = s;
620e46c5 3124 else {
830247a4 3125 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3126
830247a4
IZ
3127 if (*RExC_parse == ']') {
3128 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3129 posixcc = s + 1;
3130 if (*s == ':') {
3131 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3132 I32 skip = 5; /* the most common skip */
3133
3134 switch (*posixcc) {
3135 case 'a':
3136 if (strnEQ(posixcc, "alnum", 5))
3137 namedclass =
3138 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3139 else if (strnEQ(posixcc, "alpha", 5))
3140 namedclass =
3141 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3142 else if (strnEQ(posixcc, "ascii", 5))
3143 namedclass =
3144 complement ? ANYOF_NASCII : ANYOF_ASCII;
3145 break;
aaa51d5e
JF
3146 case 'b':
3147 if (strnEQ(posixcc, "blank", 5))
3148 namedclass =
3149 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3150 break;
b8c5462f
JH
3151 case 'c':
3152 if (strnEQ(posixcc, "cntrl", 5))
3153 namedclass =
3154 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3155 break;
3156 case 'd':
3157 if (strnEQ(posixcc, "digit", 5))
3158 namedclass =
3159 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3160 break;
3161 case 'g':
3162 if (strnEQ(posixcc, "graph", 5))
3163 namedclass =
3164 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3165 break;
3166 case 'l':
3167 if (strnEQ(posixcc, "lower", 5))
3168 namedclass =
3169 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3170 break;
3171 case 'p':
3172 if (strnEQ(posixcc, "print", 5))
3173 namedclass =
3174 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3175 else if (strnEQ(posixcc, "punct", 5))
3176 namedclass =
3177 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3178 break;
3179 case 's':
3180 if (strnEQ(posixcc, "space", 5))
3181 namedclass =
aaa51d5e 3182 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3183 break;
b8c5462f
JH
3184 case 'u':
3185 if (strnEQ(posixcc, "upper", 5))
3186 namedclass =
3187 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3188 break;
3189 case 'w': /* this is not POSIX, this is the Perl \w */
3190 if (strnEQ(posixcc, "word", 4)) {
3191 namedclass =
3192 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3193 skip = 4;
3194 }
3195 break;
3196 case 'x':
3197 if (strnEQ(posixcc, "xdigit", 6)) {
3198 namedclass =
3199 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3200 skip = 6;
3201 }
3202 break;
3203 }
ac561586
JH
3204 if (namedclass == OOB_NAMEDCLASS ||
3205 posixcc[skip] != ':' ||
3206 posixcc[skip+1] != ']')
b45f050a
JF
3207 {
3208 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3209 t - s - 1, s + 1);
3210 }
3211 } else if (!SIZE_ONLY) {
b8c5462f 3212 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3213
830247a4 3214 /* adjust RExC_parse so the warning shows after
b45f050a 3215 the class closes */
830247a4
IZ
3216 while (*RExC_parse && *RExC_parse != ']')
3217 RExC_parse++;
b45f050a
JF
3218 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3219 }
b8c5462f
JH
3220 } else {
3221 /* Maternal grandfather:
3222 * "[:" ending in ":" but not in ":]" */
830247a4 3223 RExC_parse = s;
767d463e 3224 }
620e46c5
JH
3225 }
3226 }
3227
b8c5462f
JH
3228 return namedclass;
3229}
3230
3231STATIC void
830247a4 3232S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3233{
e476b1b5 3234 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
830247a4
IZ
3235 (*RExC_parse == ':' ||
3236 *RExC_parse == '=' ||
3237 *RExC_parse == '.')) {
3238 char *s = RExC_parse;
93733859 3239 char c = *s++;
b8c5462f
JH
3240
3241 while(*s && isALNUM(*s))
3242 s++;
3243 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3244 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3245
3246 /* [[=foo=]] and [[.foo.]] are still future. */
b8c5462f 3247 if (c == '=' || c == '.')
b45f050a 3248 {
830247a4 3249 /* adjust RExC_parse so the error shows after
b45f050a 3250 the class closes */
830247a4 3251 while (*RExC_parse && *RExC_parse++ != ']')
b45f050a
JF
3252 ;
3253 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3254 }
b8c5462f
JH
3255 }
3256 }
620e46c5
JH
3257}
3258
76e3520e 3259STATIC regnode *
830247a4 3260S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3261{
ffc61ed2 3262 register UV value;
3568d838 3263 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3264 register IV range = 0;
c277df42 3265 register regnode *ret;
ba210ebe 3266 STRLEN numlen;
ffc61ed2 3267 IV namedclass;
9c5ffd7c 3268 char *rangebegin = 0;
936ed897 3269 bool need_class = 0;
9c5ffd7c 3270 SV *listsv = Nullsv;
ffc61ed2
JH
3271 register char *e;
3272 UV n;
3568d838 3273 bool optimize_invert = TRUE;
ffc61ed2
JH
3274
3275 ret = reganode(pRExC_state, ANYOF, 0);
3276
3277 if (!SIZE_ONLY)
3278 ANYOF_FLAGS(ret) = 0;
3279
3280 if (*RExC_parse == '^') { /* Complement of range. */
3281 RExC_naughty++;
3282 RExC_parse++;
3283 if (!SIZE_ONLY)
3284 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3285 }
a0d0e21e 3286
936ed897 3287 if (SIZE_ONLY)
830247a4 3288 RExC_size += ANYOF_SKIP;
936ed897 3289 else {
830247a4 3290 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3291 if (FOLD)
3292 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3293 if (LOC)
3294 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3295 ANYOF_BITMAP_ZERO(ret);
3296 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3297 }
b8c5462f 3298
e476b1b5 3299 if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
830247a4 3300 checkposixcc(pRExC_state);
b8c5462f 3301
830247a4 3302 if (*RExC_parse == ']' || *RExC_parse == '-')
ffc61ed2
JH
3303 goto charclassloop; /* allow 1st char to be ] or - */
3304
830247a4 3305 while (RExC_parse < RExC_end && *RExC_parse != ']') {
ffc61ed2
JH
3306
3307 charclassloop:
3308
3309 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3310
73b437c8 3311 if (!range)
830247a4 3312 rangebegin = RExC_parse;
ffc61ed2 3313 if (UTF) {
5e12f4fb 3314 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3315 RExC_end - RExC_parse,
3316 &numlen, 0);
ffc61ed2
JH
3317 RExC_parse += numlen;
3318 }
3319 else
3320 value = UCHARAT(RExC_parse++);
620e46c5 3321 if (value == '[')
830247a4 3322 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3323 else if (value == '\\') {
ffc61ed2 3324 if (UTF) {
5e12f4fb 3325 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3326 RExC_end - RExC_parse,
3327 &numlen, 0);
3328 RExC_parse += numlen;
3329 }
3330 else
3331 value = UCHARAT(RExC_parse++);
470c3474 3332 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3333 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3334 * be a problem later if we want switch on Unicode.
3335 * A similar issue a little bit later when switching on
3336 * namedclass. --jhi */
ffc61ed2 3337 switch ((I32)value) {
b8c5462f
JH
3338 case 'w': namedclass = ANYOF_ALNUM; break;
3339 case 'W': namedclass = ANYOF_NALNUM; break;
3340 case 's': namedclass = ANYOF_SPACE; break;
3341 case 'S': namedclass = ANYOF_NSPACE; break;
3342 case 'd': namedclass = ANYOF_DIGIT; break;
3343 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3344 case 'p':
3345 case 'P':
3346 if (*RExC_parse == '{') {
3347 e = strchr(RExC_parse++, '}');
3348 if (!e)
3349 vFAIL("Missing right brace on \\p{}");
3350 n = e - RExC_parse;
3351 }
3352 else {
3353 e = RExC_parse;
3354 n = 1;
3355 }
3356 if (!SIZE_ONLY) {
3357 if (value == 'p')
3358 Perl_sv_catpvf(aTHX_ listsv,
3359 "+utf8::%.*s\n", (int)n, RExC_parse);
3360 else
3361 Perl_sv_catpvf(aTHX_ listsv,
3362 "!utf8::%.*s\n", (int)n, RExC_parse);
3363 }
3364 RExC_parse = e + 1;
3365 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3366 continue;
b8c5462f
JH
3367 case 'n': value = '\n'; break;
3368 case 'r': value = '\r'; break;
3369 case 't': value = '\t'; break;
3370 case 'f': value = '\f'; break;
3371 case 'b': value = '\b'; break;
c7f1f016
NIS
3372 case 'e': value = ASCII_TO_NATIVE('\033');break;
3373 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3374 case 'x':
ffc61ed2
JH
3375 if (*RExC_parse == '{') {
3376 e = strchr(RExC_parse++, '}');
b81d288d 3377 if (!e)
ffc61ed2
JH
3378 vFAIL("Missing right brace on \\x{}");
3379 numlen = 1; /* allow underscores */
3380 value = (UV)scan_hex(RExC_parse,
3381 e - RExC_parse,
3382 &numlen);
3383 RExC_parse = e + 1;
3384 }
3385 else {
3386 numlen = 0; /* disallow underscores */
3387 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3388 RExC_parse += numlen;
3389 }
b8c5462f
JH
3390 break;
3391 case 'c':
830247a4 3392 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3393 value = toCTRL(value);
3394 break;
3395 case '0': case '1': case '2': case '3': case '4':
3396 case '5': case '6': case '7': case '8': case '9':
b21ed0a9 3397 numlen = 0; /* disallow underscores */
830247a4
IZ
3398 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3399 RExC_parse += numlen;
b8c5462f 3400 break;
1028017a 3401 default:
e476b1b5 3402 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3403 vWARN2(RExC_parse,
3404 "Unrecognized escape \\%c in character class passed through",
3405 (int)value);
1028017a 3406 break;
b8c5462f 3407 }
ffc61ed2
JH
3408 } /* end of \blah */
3409
3410 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3411
3412 if (!SIZE_ONLY && !need_class)
936ed897 3413 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3414
936ed897 3415 need_class = 1;
ffc61ed2
JH
3416
3417 /* a bad range like a-\d, a-[:digit:] ? */
3418 if (range) {
73b437c8 3419 if (!SIZE_ONLY) {
e476b1b5 3420 if (ckWARN(WARN_REGEXP))
830247a4 3421 vWARN4(RExC_parse,
b45f050a 3422 "False [] range \"%*.*s\"",
830247a4
IZ
3423 RExC_parse - rangebegin,
3424 RExC_parse - rangebegin,
b45f050a 3425 rangebegin);
3568d838
JH
3426 if (prevvalue < 256) {
3427 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3428 ANYOF_BITMAP_SET(ret, '-');
3429 }
3430 else {
3431 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3432 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3433 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3434 }
b8c5462f 3435 }
ffc61ed2
JH
3436
3437 range = 0; /* this was not a true range */
73b437c8 3438 }
ffc61ed2 3439
73b437c8 3440 if (!SIZE_ONLY) {
3568d838
JH
3441 if (namedclass > OOB_NAMEDCLASS)
3442 optimize_invert = FALSE;
e2962f66
JH
3443 /* Possible truncation here but in some 64-bit environments
3444 * the compiler gets heartburn about switch on 64-bit values.
3445 * A similar issue a little earlier when switching on value.
98f323fa 3446 * --jhi */
e2962f66 3447 switch ((I32)namedclass) {
73b437c8
JH
3448 case ANYOF_ALNUM:
3449 if (LOC)
936ed897 3450 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3451 else {
3452 for (value = 0; value < 256; value++)
3453 if (isALNUM(value))
936ed897 3454 ANYOF_BITMAP_SET(ret, value);
73b437c8 3455 }
ffc61ed2 3456 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3457 break;