This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: sitecustomize.pl [PATCH]
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 *
80 **** Alterations to Henry's code are...
81 ****
4bb101f2 82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 84 ****
9ef589d8
LW
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
87
a687059c
LW
88 *
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
92 */
93#include "EXTERN.h"
864dbfa3 94#define PERL_IN_REGCOMP_C
a687059c 95#include "perl.h"
d06ea78c 96
acfe0abc 97#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
98# include "INTERN.h"
99#endif
c277df42
IZ
100
101#define REG_COMP_C
a687059c
LW
102#include "regcomp.h"
103
d4cce5f1 104#ifdef op
11343788 105#undef op
d4cce5f1 106#endif /* op */
11343788 107
fe14fcc3 108#ifdef MSDOS
7e4e8c89 109# if defined(BUGGY_MSC6)
fe14fcc3 110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 111# pragma optimize("a",off)
fe14fcc3 112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
113# pragma optimize("w",on )
114# endif /* BUGGY_MSC6 */
fe14fcc3
LW
115#endif /* MSDOS */
116
a687059c
LW
117#ifndef STATIC
118#define STATIC static
119#endif
120
830247a4 121typedef struct RExC_state_t {
e2509266 122 U32 flags; /* are we folding, multilining? */
830247a4
IZ
123 char *precomp; /* uncompiled string. */
124 regexp *rx;
fac92740 125 char *start; /* Start of input for compile */
830247a4
IZ
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 129 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 130 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
133 U32 seen;
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
136 I32 extralen;
137 I32 seen_zerolen;
138 I32 seen_evals;
1aa99e6b 139 I32 utf8;
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
fac92740 149#define RExC_start (pRExC_state->start)
830247a4
IZ
150#define RExC_end (pRExC_state->end)
151#define RExC_parse (pRExC_state->parse)
152#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 153#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 154#define RExC_emit (pRExC_state->emit)
fac92740 155#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
156#define RExC_naughty (pRExC_state->naughty)
157#define RExC_sawback (pRExC_state->sawback)
158#define RExC_seen (pRExC_state->seen)
159#define RExC_size (pRExC_state->size)
160#define RExC_npar (pRExC_state->npar)
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
830247a4 165
a687059c
LW
166#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
a687059c 169
35c8bce7
LW
170#ifdef SPSTART
171#undef SPSTART /* dratted cpp namespace... */
172#endif
a687059c
LW
173/*
174 * Flags to be passed up and down.
175 */
a687059c 176#define WORST 0 /* Worst case. */
821b33a5 177#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
178#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179#define SPSTART 0x4 /* Starts with * or +. */
180#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 181
2c2d71f5
JH
182/* Length of a variant. */
183
184typedef struct scan_data_t {
185 I32 len_min;
186 I32 len_delta;
187 I32 pos_min;
188 I32 pos_delta;
189 SV *last_found;
190 I32 last_end; /* min value, <0 unless valid. */
191 I32 last_start_min;
192 I32 last_start_max;
193 SV **longest; /* Either &l_fixed, or &l_float. */
194 SV *longest_fixed;
195 I32 offset_fixed;
196 SV *longest_float;
197 I32 offset_float_min;
198 I32 offset_float_max;
199 I32 flags;
200 I32 whilem_c;
cb434fcc 201 I32 *last_closep;
653099ff 202 struct regnode_charclass_class *start_class;
2c2d71f5
JH
203} scan_data_t;
204
a687059c 205/*
e50aee73 206 * Forward declarations for pregcomp()'s friends.
a687059c 207 */
a0d0e21e 208
b81d288d 209static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 210 0, 0, 0, 0, 0, 0};
c277df42
IZ
211
212#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213#define SF_BEFORE_SEOL 0x1
214#define SF_BEFORE_MEOL 0x2
215#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
09b7f37c
CB
218#ifdef NO_UNARY_PLUS
219# define SF_FIX_SHIFT_EOL (0+2)
220# define SF_FL_SHIFT_EOL (0+4)
221#else
222# define SF_FIX_SHIFT_EOL (+2)
223# define SF_FL_SHIFT_EOL (+4)
224#endif
c277df42
IZ
225
226#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231#define SF_IS_INF 0x40
232#define SF_HAS_PAR 0x80
233#define SF_IN_PAR 0x100
234#define SF_HAS_EVAL 0x200
4bfe0158 235#define SCF_DO_SUBSTR 0x400
653099ff
GS
236#define SCF_DO_STCLASS_AND 0x0800
237#define SCF_DO_STCLASS_OR 0x1000
238#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 239#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 240
eb160463 241#define UTF (RExC_utf8 != 0)
e2509266
JH
242#define LOC ((RExC_flags & PMf_LOCALE) != 0)
243#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 244
ffc61ed2 245#define OOB_UNICODE 12345678
93733859 246#define OOB_NAMEDCLASS -1
b8c5462f 247
a0ed51b3
LW
248#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
8615cb43 251
b45f050a
JF
252/* length of regex to show in messages that don't mark a position within */
253#define RegexLengthToShowInErrorMessages 127
254
255/*
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
259 */
7253e4e3
RK
260#define MARKER1 "<-- HERE" /* marker as it appears in the description */
261#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 262
7253e4e3 263#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
264
265/*
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 * "...".
269 */
ccb2c380
MP
270#define FAIL(msg) STMT_START { \
271 char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
273 \
274 if (!SIZE_ONLY) \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
279 ellipses = "..."; \
280 } \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
283} STMT_END
8615cb43 284
b45f050a
JF
285/*
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
288 * "...".
289 */
ccb2c380
MP
290#define FAIL2(pat,msg) STMT_START { \
291 char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
293 \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
299 ellipses = "..."; \
300 } \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
303} STMT_END
b45f050a
JF
304
305
306/*
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 */
ccb2c380
MP
309#define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
313} STMT_END
b45f050a
JF
314
315/*
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 */
ccb2c380
MP
318#define vFAIL(m) STMT_START { \
319 if (!SIZE_ONLY) \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 Simple_vFAIL(m); \
322} STMT_END
b45f050a
JF
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
ccb2c380
MP
327#define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
331} STMT_END
b45f050a
JF
332
333/*
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 */
ccb2c380
MP
336#define vFAIL2(m,a1) STMT_START { \
337 if (!SIZE_ONLY) \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
340} STMT_END
b45f050a
JF
341
342
343/*
344 * Like Simple_vFAIL(), but accepts three arguments.
345 */
ccb2c380
MP
346#define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
350} STMT_END
b45f050a
JF
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
ccb2c380
MP
355#define vFAIL3(m,a1,a2) STMT_START { \
356 if (!SIZE_ONLY) \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
359} STMT_END
b45f050a
JF
360
361/*
362 * Like Simple_vFAIL(), but accepts four arguments.
363 */
ccb2c380
MP
364#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
b45f050a
JF
369
370/*
371 * Like Simple_vFAIL(), but accepts five arguments.
372 */
ccb2c380
MP
373#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379
380#define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
385
386#define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
392
393
394#define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
398} STMT_END
399
400#define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
405
406#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
411
412#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416} STMT_END
9d1d55b5 417
8615cb43 418
cd439c50 419/* Allow for side effects in s */
ccb2c380
MP
420#define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422} STMT_END
cd439c50 423
fac92740
MJD
424/* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
428 */
429
430#define MJD_OFFSET_DEBUG(x)
ccb2c380
MP
431/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
432
433
434#define Set_Node_Offset_To_R(node,byte) STMT_START { \
435 if (! SIZE_ONLY) { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444} STMT_END
445
446#define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450#define Set_Node_Length_To_R(node,len) STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
454 if((node) < 0) { \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456 } else { \
457 RExC_offsets[2*(node)] = (len); \
458 } \
459 } \
460} STMT_END
461
462#define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465#define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
467
468/* Get offsets and lengths */
469#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
acfe0abc 472static void clear_re(pTHX_ void *r);
4327152a 473
653099ff
GS
474/* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
477
4327152a 478STATIC void
830247a4 479S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 480{
a0ed51b3
LW
481 STRLEN l = CHR_SVLEN(data->last_found);
482 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 483
c277df42 484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 485 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
b81d288d 489 data->flags
c277df42
IZ
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491 else
492 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
493 }
494 else {
c277df42 495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
496 data->offset_float_max = (l
497 ? data->last_start_max
c277df42 498 : data->pos_min + data->pos_delta);
9051bda5
HS
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
c277df42 501 if (data->flags & SF_BEFORE_EOL)
b81d288d 502 data->flags
c277df42
IZ
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504 else
505 data->flags &= ~SF_FL_BEFORE_EOL;
506 }
507 }
508 SvCUR_set(data->last_found, 0);
0eda9292
JH
509 {
510 SV * sv = data->last_found;
511 MAGIC *mg =
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
514 mg->mg_len = 0;
515 }
c277df42
IZ
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b 588
14ebb1a2
JH
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
14ebb1a2
JH
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 597 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 600 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
601}
602
603/* 'OR' a given class with another one. Can create false positives */
604/* We assume that cl is not inverted */
605STATIC void
830247a4 606S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 607{
653099ff
GS
608 if (or_with->flags & ANYOF_INVERT) {
609 /* We do not use
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617 */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
621 int i;
622
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
626 else {
830247a4 627 cl_anything(pRExC_state, cl);
653099ff
GS
628 }
629 } else {
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 632 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
633 || (cl->flags & ANYOF_FOLD)) ) {
634 int i;
635
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
643 }
644 }
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647 }
648 }
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
651
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
656 }
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
660 }
653099ff
GS
661}
662
5d1c421c
JH
663/*
664 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
665 * These need to be revisited when a newer toolchain becomes available.
666 */
667#if defined(__sparc64__) && defined(__GNUC__)
668# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
669# undef SPARC64_GCC_WORKAROUND
670# define SPARC64_GCC_WORKAROUND 1
671# endif
672#endif
673
653099ff
GS
674/* REx optimizer. Converts nodes into quickier variants "in place".
675 Finds fixed substrings. */
676
c277df42
IZ
677/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
678 to the position after last scanned or to NULL. */
679
76e3520e 680STATIC I32
830247a4 681S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
682 /* scanp: Start here (read-write). */
683 /* deltap: Write maxlen-minlen here. */
684 /* last: Stop before this one. */
685{
686 I32 min = 0, pars = 0, code;
687 regnode *scan = *scanp, *next;
688 I32 delta = 0;
689 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 690 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
691 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
692 scan_data_t data_fake;
653099ff 693 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 694
c277df42
IZ
695 while (scan && OP(scan) != END && scan < last) {
696 /* Peephole optimizer: */
697
22c35a8c 698 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 699 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
700 regnode *n = regnext(scan);
701 U32 stringok = 1;
702#ifdef DEBUGGING
703 regnode *stop = scan;
b81d288d 704#endif
c277df42 705
cd439c50 706 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
707 /* Skip NOTHING, merge EXACT*. */
708 while (n &&
b81d288d 709 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
710 (stringok && (OP(n) == OP(scan))))
711 && NEXT_OFF(n)
712 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
713 if (OP(n) == TAIL || n > next)
714 stringok = 0;
22c35a8c 715 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
716 NEXT_OFF(scan) += NEXT_OFF(n);
717 next = n + NODE_STEP_REGNODE;
718#ifdef DEBUGGING
719 if (stringok)
720 stop = n;
b81d288d 721#endif
c277df42 722 n = regnext(n);
a0ed51b3 723 }
f49d4d0f 724 else if (stringok) {
cd439c50 725 int oldl = STR_LEN(scan);
c277df42 726 regnode *nnext = regnext(n);
f49d4d0f 727
b81d288d 728 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
729 break;
730 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
731 STR_LEN(scan) += STR_LEN(n);
732 next = n + NODE_SZ_STR(n);
c277df42 733 /* Now we can overwrite *n : */
f49d4d0f 734 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 735#ifdef DEBUGGING
f49d4d0f 736 stop = next - 1;
b81d288d 737#endif
c277df42
IZ
738 n = nnext;
739 }
740 }
61a36c01 741
d65e4eab 742 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
743/*
744 Two problematic code points in Unicode casefolding of EXACT nodes:
745
746 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
747 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
748
749 which casefold to
750
751 Unicode UTF-8
752
753 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
754 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
755
756 This means that in case-insensitive matching (or "loose matching",
757 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
758 length of the above casefolded versions) can match a target string
759 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
760 This would rather mess up the minimum length computation.
761
762 What we'll do is to look for the tail four bytes, and then peek
763 at the preceding two bytes to see whether we need to decrease
764 the minimum length by four (six minus two).
765
766 Thanks to the design of UTF-8, there cannot be false matches:
767 A sequence of valid UTF-8 bytes cannot be a subsequence of
768 another valid sequence of UTF-8 bytes.
769
770*/
771 char *s0 = STRING(scan), *s, *t;
772 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
773 char *t0 = "\xcc\x88\xcc\x81";
774 char *t1 = t0 + 3;
775
776 for (s = s0 + 2;
777 s < s2 && (t = ninstr(s, s1, t0, t1));
778 s = t + 4) {
779 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
780 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
781 min -= 4;
782 }
783 }
784
c277df42
IZ
785#ifdef DEBUGGING
786 /* Allow dumping */
cd439c50 787 n = scan + NODE_SZ_STR(scan);
c277df42 788 while (n <= stop) {
22c35a8c 789 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
790 OP(n) = OPTIMIZED;
791 NEXT_OFF(n) = 0;
792 }
793 n++;
794 }
653099ff 795#endif
c277df42 796 }
653099ff
GS
797 /* Follow the next-chain of the current node and optimize
798 away all the NOTHINGs from it. */
c277df42 799 if (OP(scan) != CURLYX) {
048cfca1
GS
800 int max = (reg_off_by_arg[OP(scan)]
801 ? I32_MAX
802 /* I32 may be smaller than U16 on CRAYs! */
803 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
804 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
805 int noff;
806 regnode *n = scan;
b81d288d 807
c277df42
IZ
808 /* Skip NOTHING and LONGJMP. */
809 while ((n = regnext(n))
22c35a8c 810 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
811 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
812 && off + noff < max)
813 off += noff;
814 if (reg_off_by_arg[OP(scan)])
815 ARG(scan) = off;
b81d288d 816 else
c277df42
IZ
817 NEXT_OFF(scan) = off;
818 }
653099ff
GS
819 /* The principal pseudo-switch. Cannot be a switch, since we
820 look into several different things. */
b81d288d 821 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
822 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
823 next = regnext(scan);
824 code = OP(scan);
b81d288d
AB
825
826 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 827 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 828 struct regnode_charclass_class accum;
c277df42 829
653099ff 830 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 831 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 832 if (flags & SCF_DO_STCLASS)
830247a4 833 cl_init_zero(pRExC_state, &accum);
c277df42 834 while (OP(scan) == code) {
830247a4 835 I32 deltanext, minnext, f = 0, fake;
653099ff 836 struct regnode_charclass_class this_class;
c277df42
IZ
837
838 num++;
839 data_fake.flags = 0;
b81d288d 840 if (data) {
2c2d71f5 841 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
842 data_fake.last_closep = data->last_closep;
843 }
844 else
845 data_fake.last_closep = &fake;
c277df42
IZ
846 next = regnext(scan);
847 scan = NEXTOPER(scan);
848 if (code != BRANCH)
849 scan = NEXTOPER(scan);
653099ff 850 if (flags & SCF_DO_STCLASS) {
830247a4 851 cl_init(pRExC_state, &this_class);
653099ff
GS
852 data_fake.start_class = &this_class;
853 f = SCF_DO_STCLASS_AND;
b81d288d 854 }
e1901655
IZ
855 if (flags & SCF_WHILEM_VISITED_POS)
856 f |= SCF_WHILEM_VISITED_POS;
653099ff 857 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
858 minnext = study_chunk(pRExC_state, &scan, &deltanext,
859 next, &data_fake, f);
b81d288d 860 if (min1 > minnext)
c277df42
IZ
861 min1 = minnext;
862 if (max1 < minnext + deltanext)
863 max1 = minnext + deltanext;
864 if (deltanext == I32_MAX)
aca2d497 865 is_inf = is_inf_internal = 1;
c277df42
IZ
866 scan = next;
867 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
868 pars++;
405ff068 869 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 870 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
871 if (data)
872 data->whilem_c = data_fake.whilem_c;
653099ff 873 if (flags & SCF_DO_STCLASS)
830247a4 874 cl_or(pRExC_state, &accum, &this_class);
b81d288d 875 if (code == SUSPEND)
c277df42
IZ
876 break;
877 }
878 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
879 min1 = 0;
880 if (flags & SCF_DO_SUBSTR) {
881 data->pos_min += min1;
882 data->pos_delta += max1 - min1;
883 if (max1 != min1 || is_inf)
884 data->longest = &(data->longest_float);
885 }
886 min += min1;
887 delta += max1 - min1;
653099ff 888 if (flags & SCF_DO_STCLASS_OR) {
830247a4 889 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
890 if (min1) {
891 cl_and(data->start_class, &and_with);
892 flags &= ~SCF_DO_STCLASS;
893 }
894 }
895 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
896 if (min1) {
897 cl_and(data->start_class, &accum);
653099ff 898 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
899 }
900 else {
b81d288d 901 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
902 * data->start_class */
903 StructCopy(data->start_class, &and_with,
904 struct regnode_charclass_class);
905 flags &= ~SCF_DO_STCLASS_AND;
906 StructCopy(&accum, data->start_class,
907 struct regnode_charclass_class);
908 flags |= SCF_DO_STCLASS_OR;
909 data->start_class->flags |= ANYOF_EOS;
910 }
653099ff 911 }
a0ed51b3
LW
912 }
913 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
914 scan = NEXTOPER(NEXTOPER(scan));
915 else /* single branch is optimized. */
916 scan = NEXTOPER(scan);
917 continue;
a0ed51b3
LW
918 }
919 else if (OP(scan) == EXACT) {
cd439c50 920 I32 l = STR_LEN(scan);
1aa99e6b 921 UV uc = *((U8*)STRING(scan));
a0ed51b3 922 if (UTF) {
1aa99e6b
IH
923 U8 *s = (U8*)STRING(scan);
924 l = utf8_length(s, s + l);
9041c2e3 925 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
926 }
927 min += l;
c277df42 928 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
929 /* The code below prefers earlier match for fixed
930 offset, later match for variable offset. */
931 if (data->last_end == -1) { /* Update the start info. */
932 data->last_start_min = data->pos_min;
933 data->last_start_max = is_inf
b81d288d 934 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 935 }
cd439c50 936 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
937 {
938 SV * sv = data->last_found;
939 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
940 mg_find(sv, PERL_MAGIC_utf8) : NULL;
941 if (mg && mg->mg_len >= 0)
5e43f467
JH
942 mg->mg_len += utf8_length((U8*)STRING(scan),
943 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 944 }
33b8afdf
JH
945 if (UTF)
946 SvUTF8_on(data->last_found);
c277df42
IZ
947 data->last_end = data->pos_min + l;
948 data->pos_min += l; /* As in the first entry. */
949 data->flags &= ~SF_BEFORE_EOL;
950 }
653099ff
GS
951 if (flags & SCF_DO_STCLASS_AND) {
952 /* Check whether it is compatible with what we know already! */
953 int compat = 1;
954
1aa99e6b 955 if (uc >= 0x100 ||
516a5887 956 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 957 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 958 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 959 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 960 )
653099ff
GS
961 compat = 0;
962 ANYOF_CLASS_ZERO(data->start_class);
963 ANYOF_BITMAP_ZERO(data->start_class);
964 if (compat)
1aa99e6b 965 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 966 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
967 if (uc < 0x100)
968 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
969 }
970 else if (flags & SCF_DO_STCLASS_OR) {
971 /* false positive possible if the class is case-folded */
1aa99e6b 972 if (uc < 0x100)
9b877dbb
IH
973 ANYOF_BITMAP_SET(data->start_class, uc);
974 else
975 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
976 data->start_class->flags &= ~ANYOF_EOS;
977 cl_and(data->start_class, &and_with);
978 }
979 flags &= ~SCF_DO_STCLASS;
a0ed51b3 980 }
653099ff 981 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 982 I32 l = STR_LEN(scan);
1aa99e6b 983 UV uc = *((U8*)STRING(scan));
653099ff
GS
984
985 /* Search for fixed substrings supports EXACT only. */
b81d288d 986 if (flags & SCF_DO_SUBSTR)
830247a4 987 scan_commit(pRExC_state, data);
a0ed51b3 988 if (UTF) {
1aa99e6b
IH
989 U8 *s = (U8 *)STRING(scan);
990 l = utf8_length(s, s + l);
9041c2e3 991 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
992 }
993 min += l;
c277df42 994 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 995 data->pos_min += l;
653099ff
GS
996 if (flags & SCF_DO_STCLASS_AND) {
997 /* Check whether it is compatible with what we know already! */
998 int compat = 1;
999
1aa99e6b 1000 if (uc >= 0x100 ||
516a5887 1001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 1002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 1003 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
1004 compat = 0;
1005 ANYOF_CLASS_ZERO(data->start_class);
1006 ANYOF_BITMAP_ZERO(data->start_class);
1007 if (compat) {
1aa99e6b 1008 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1009 data->start_class->flags &= ~ANYOF_EOS;
1010 data->start_class->flags |= ANYOF_FOLD;
1011 if (OP(scan) == EXACTFL)
1012 data->start_class->flags |= ANYOF_LOCALE;
1013 }
1014 }
1015 else if (flags & SCF_DO_STCLASS_OR) {
1016 if (data->start_class->flags & ANYOF_FOLD) {
1017 /* false positive possible if the class is case-folded.
1018 Assume that the locale settings are the same... */
1aa99e6b
IH
1019 if (uc < 0x100)
1020 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1021 data->start_class->flags &= ~ANYOF_EOS;
1022 }
1023 cl_and(data->start_class, &and_with);
1024 }
1025 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1026 }
4d61ec05 1027 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1028 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1029 I32 f = flags, pos_before = 0;
c277df42 1030 regnode *oscan = scan;
653099ff
GS
1031 struct regnode_charclass_class this_class;
1032 struct regnode_charclass_class *oclass = NULL;
727f22e3 1033 I32 next_is_eval = 0;
653099ff 1034
22c35a8c 1035 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1036 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1037 scan = NEXTOPER(scan);
1038 goto finish;
1039 case PLUS:
653099ff 1040 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1041 next = NEXTOPER(scan);
653099ff 1042 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1043 mincount = 1;
1044 maxcount = REG_INFTY;
c277df42
IZ
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1047 goto do_curly;
1048 }
1049 }
1050 if (flags & SCF_DO_SUBSTR)
1051 data->pos_min++;
1052 min++;
1053 /* Fall through. */
1054 case STAR:
653099ff
GS
1055 if (flags & SCF_DO_STCLASS) {
1056 mincount = 0;
b81d288d 1057 maxcount = REG_INFTY;
653099ff
GS
1058 next = regnext(scan);
1059 scan = NEXTOPER(scan);
1060 goto do_curly;
1061 }
b81d288d 1062 is_inf = is_inf_internal = 1;
c277df42
IZ
1063 scan = regnext(scan);
1064 if (flags & SCF_DO_SUBSTR) {
830247a4 1065 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1066 data->longest = &(data->longest_float);
1067 }
1068 goto optimize_curly_tail;
1069 case CURLY:
b81d288d 1070 mincount = ARG1(scan);
c277df42
IZ
1071 maxcount = ARG2(scan);
1072 next = regnext(scan);
cb434fcc
IZ
1073 if (OP(scan) == CURLYX) {
1074 I32 lp = (data ? *(data->last_closep) : 0);
1075
1076 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1077 }
c277df42 1078 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1079 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1080 do_curly:
1081 if (flags & SCF_DO_SUBSTR) {
830247a4 1082 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1083 pos_before = data->pos_min;
1084 }
1085 if (data) {
1086 fl = data->flags;
1087 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1088 if (is_inf)
1089 data->flags |= SF_IS_INF;
1090 }
653099ff 1091 if (flags & SCF_DO_STCLASS) {
830247a4 1092 cl_init(pRExC_state, &this_class);
653099ff
GS
1093 oclass = data->start_class;
1094 data->start_class = &this_class;
1095 f |= SCF_DO_STCLASS_AND;
1096 f &= ~SCF_DO_STCLASS_OR;
1097 }
e1901655
IZ
1098 /* These are the cases when once a subexpression
1099 fails at a particular position, it cannot succeed
1100 even after backtracking at the enclosing scope.
b81d288d 1101
e1901655
IZ
1102 XXXX what if minimal match and we are at the
1103 initial run of {n,m}? */
1104 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1105 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1106
c277df42 1107 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1108 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1109 mincount == 0
653099ff
GS
1110 ? (f & ~SCF_DO_SUBSTR) : f);
1111
1112 if (flags & SCF_DO_STCLASS)
1113 data->start_class = oclass;
1114 if (mincount == 0 || minnext == 0) {
1115 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1116 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1117 }
1118 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1119 /* Switch to OR mode: cache the old value of
653099ff
GS
1120 * data->start_class */
1121 StructCopy(data->start_class, &and_with,
1122 struct regnode_charclass_class);
1123 flags &= ~SCF_DO_STCLASS_AND;
1124 StructCopy(&this_class, data->start_class,
1125 struct regnode_charclass_class);
1126 flags |= SCF_DO_STCLASS_OR;
1127 data->start_class->flags |= ANYOF_EOS;
1128 }
1129 } else { /* Non-zero len */
1130 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1131 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1132 cl_and(data->start_class, &and_with);
1133 }
1134 else if (flags & SCF_DO_STCLASS_AND)
1135 cl_and(data->start_class, &this_class);
1136 flags &= ~SCF_DO_STCLASS;
1137 }
c277df42
IZ
1138 if (!scan) /* It was not CURLYX, but CURLY. */
1139 scan = next;
84037bb0 1140 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1141 /* ? quantifier ok, except for (?{ ... }) */
1142 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1143 && (minnext == 0) && (deltanext == 0)
99799961 1144 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1145 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1146 {
830247a4 1147 vWARN(RExC_parse,
b45f050a
JF
1148 "Quantifier unexpected on zero-length expression");
1149 }
1150
c277df42 1151 min += minnext * mincount;
b81d288d 1152 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1153 && (minnext + deltanext) > 0)
1154 || deltanext == I32_MAX);
aca2d497 1155 is_inf |= is_inf_internal;
c277df42
IZ
1156 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1157
1158 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1159 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1160 && data->flags & SF_IN_PAR
1161 && !(data->flags & SF_HAS_EVAL)
1162 && !deltanext && minnext == 1 ) {
1163 /* Try to optimize to CURLYN. */
1164 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1165 regnode *nxt1 = nxt;
1166#ifdef DEBUGGING
1167 regnode *nxt2;
1168#endif
c277df42
IZ
1169
1170 /* Skip open. */
1171 nxt = regnext(nxt);
4d61ec05 1172 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1173 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1174 && STR_LEN(nxt) == 1))
c277df42 1175 goto nogo;
497b47a8 1176#ifdef DEBUGGING
c277df42 1177 nxt2 = nxt;
497b47a8 1178#endif
c277df42 1179 nxt = regnext(nxt);
b81d288d 1180 if (OP(nxt) != CLOSE)
c277df42
IZ
1181 goto nogo;
1182 /* Now we know that nxt2 is the only contents: */
eb160463 1183 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1184 OP(oscan) = CURLYN;
1185 OP(nxt1) = NOTHING; /* was OPEN. */
1186#ifdef DEBUGGING
1187 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1188 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1189 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1190 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1191 OP(nxt + 1) = OPTIMIZED; /* was count. */
1192 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1193#endif
c277df42 1194 }
c277df42
IZ
1195 nogo:
1196
1197 /* Try optimization CURLYX => CURLYM. */
b81d288d 1198 if ( OP(oscan) == CURLYX && data
c277df42 1199 && !(data->flags & SF_HAS_PAR)
c277df42 1200 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
1201 && !deltanext /* atom is fixed width */
1202 && minnext != 0 /* CURLYM can't handle zero width */
1203 ) {
c277df42
IZ
1204 /* XXXX How to optimize if data == 0? */
1205 /* Optimize to a simpler form. */
1206 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1207 regnode *nxt2;
1208
1209 OP(oscan) = CURLYM;
1210 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1211 && (OP(nxt2) != WHILEM))
c277df42
IZ
1212 nxt = nxt2;
1213 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1214 /* Need to optimize away parenths. */
1215 if (data->flags & SF_IN_PAR) {
1216 /* Set the parenth number. */
1217 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1218
b81d288d 1219 if (OP(nxt) != CLOSE)
b45f050a 1220 FAIL("Panic opt close");
eb160463 1221 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1222 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1223 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1224#ifdef DEBUGGING
1225 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1226 OP(nxt + 1) = OPTIMIZED; /* was count. */
1227 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1228 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1229#endif
c277df42
IZ
1230#if 0
1231 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1232 regnode *nnxt = regnext(nxt1);
b81d288d 1233
c277df42
IZ
1234 if (nnxt == nxt) {
1235 if (reg_off_by_arg[OP(nxt1)])
1236 ARG_SET(nxt1, nxt2 - nxt1);
1237 else if (nxt2 - nxt1 < U16_MAX)
1238 NEXT_OFF(nxt1) = nxt2 - nxt1;
1239 else
1240 OP(nxt) = NOTHING; /* Cannot beautify */
1241 }
1242 nxt1 = nnxt;
1243 }
1244#endif
1245 /* Optimize again: */
b81d288d 1246 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1247 NULL, 0);
a0ed51b3
LW
1248 }
1249 else
c277df42 1250 oscan->flags = 0;
c277df42 1251 }
e1901655
IZ
1252 else if ((OP(oscan) == CURLYX)
1253 && (flags & SCF_WHILEM_VISITED_POS)
1254 /* See the comment on a similar expression above.
1255 However, this time it not a subexpression
1256 we care about, but the expression itself. */
1257 && (maxcount == REG_INFTY)
1258 && data && ++data->whilem_c < 16) {
1259 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1260 /* Find WHILEM (as in regexec.c) */
1261 regnode *nxt = oscan + NEXT_OFF(oscan);
1262
1263 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1264 nxt += ARG(nxt);
eb160463
GS
1265 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1266 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1267 }
b81d288d 1268 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1269 pars++;
1270 if (flags & SCF_DO_SUBSTR) {
1271 SV *last_str = Nullsv;
1272 int counted = mincount != 0;
1273
1274 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1275#if defined(SPARC64_GCC_WORKAROUND)
1276 I32 b = 0;
1277 STRLEN l = 0;
1278 char *s = NULL;
1279 I32 old = 0;
1280
1281 if (pos_before >= data->last_start_min)
1282 b = pos_before;
1283 else
1284 b = data->last_start_min;
1285
1286 l = 0;
1287 s = SvPV(data->last_found, l);
1288 old = b - data->last_start_min;
1289
1290#else
b81d288d 1291 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1292 ? pos_before : data->last_start_min;
1293 STRLEN l;
1294 char *s = SvPV(data->last_found, l);
a0ed51b3 1295 I32 old = b - data->last_start_min;
5d1c421c 1296#endif
a0ed51b3
LW
1297
1298 if (UTF)
1299 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1300
a0ed51b3 1301 l -= old;
c277df42 1302 /* Get the added string: */
79cb57f6 1303 last_str = newSVpvn(s + old, l);
0e933229
IH
1304 if (UTF)
1305 SvUTF8_on(last_str);
c277df42
IZ
1306 if (deltanext == 0 && pos_before == b) {
1307 /* What was added is a constant string */
1308 if (mincount > 1) {
1309 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1310 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1311 SvPVX(last_str), l, mincount - 1);
1312 SvCUR(last_str) *= mincount;
1313 /* Add additional parts. */
b81d288d 1314 SvCUR_set(data->last_found,
c277df42
IZ
1315 SvCUR(data->last_found) - l);
1316 sv_catsv(data->last_found, last_str);
0eda9292
JH
1317 {
1318 SV * sv = data->last_found;
1319 MAGIC *mg =
1320 SvUTF8(sv) && SvMAGICAL(sv) ?
1321 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1322 if (mg && mg->mg_len >= 0)
1323 mg->mg_len += CHR_SVLEN(last_str);
1324 }
c277df42
IZ
1325 data->last_end += l * (mincount - 1);
1326 }
2a8d9689
HS
1327 } else {
1328 /* start offset must point into the last copy */
1329 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
1330 data->last_start_max += is_inf ? I32_MAX
1331 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1332 }
1333 }
1334 /* It is counted once already... */
1335 data->pos_min += minnext * (mincount - counted);
1336 data->pos_delta += - counted * deltanext +
1337 (minnext + deltanext) * maxcount - minnext * mincount;
1338 if (mincount != maxcount) {
653099ff
GS
1339 /* Cannot extend fixed substrings found inside
1340 the group. */
830247a4 1341 scan_commit(pRExC_state,data);
c277df42
IZ
1342 if (mincount && last_str) {
1343 sv_setsv(data->last_found, last_str);
1344 data->last_end = data->pos_min;
b81d288d 1345 data->last_start_min =
a0ed51b3 1346 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1347 data->last_start_max = is_inf
1348 ? I32_MAX
c277df42 1349 : data->pos_min + data->pos_delta
a0ed51b3 1350 - CHR_SVLEN(last_str);
c277df42
IZ
1351 }
1352 data->longest = &(data->longest_float);
1353 }
aca2d497 1354 SvREFCNT_dec(last_str);
c277df42 1355 }
405ff068 1356 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1357 data->flags |= SF_HAS_EVAL;
1358 optimize_curly_tail:
c277df42 1359 if (OP(oscan) != CURLYX) {
22c35a8c 1360 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1361 && NEXT_OFF(next))
1362 NEXT_OFF(oscan) += NEXT_OFF(next);
1363 }
c277df42 1364 continue;
653099ff 1365 default: /* REF and CLUMP only? */
c277df42 1366 if (flags & SCF_DO_SUBSTR) {
830247a4 1367 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1368 data->longest = &(data->longest_float);
1369 }
aca2d497 1370 is_inf = is_inf_internal = 1;
653099ff 1371 if (flags & SCF_DO_STCLASS_OR)
830247a4 1372 cl_anything(pRExC_state, data->start_class);
653099ff 1373 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1374 break;
1375 }
a0ed51b3 1376 }
ffc61ed2 1377 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1378 int value = 0;
653099ff 1379
c277df42 1380 if (flags & SCF_DO_SUBSTR) {
830247a4 1381 scan_commit(pRExC_state,data);
c277df42
IZ
1382 data->pos_min++;
1383 }
1384 min++;
653099ff
GS
1385 if (flags & SCF_DO_STCLASS) {
1386 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1387
1388 /* Some of the logic below assumes that switching
1389 locale on will only add false positives. */
1390 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1391 case SANY:
653099ff
GS
1392 default:
1393 do_default:
1394 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1395 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1396 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1397 break;
1398 case REG_ANY:
1399 if (OP(scan) == SANY)
1400 goto do_default;
1401 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1402 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1403 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1404 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1405 }
1406 if (flags & SCF_DO_STCLASS_AND || !value)
1407 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1408 break;
1409 case ANYOF:
1410 if (flags & SCF_DO_STCLASS_AND)
1411 cl_and(data->start_class,
1412 (struct regnode_charclass_class*)scan);
1413 else
830247a4 1414 cl_or(pRExC_state, data->start_class,
653099ff
GS
1415 (struct regnode_charclass_class*)scan);
1416 break;
1417 case ALNUM:
1418 if (flags & SCF_DO_STCLASS_AND) {
1419 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1420 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1421 for (value = 0; value < 256; value++)
1422 if (!isALNUM(value))
1423 ANYOF_BITMAP_CLEAR(data->start_class, value);
1424 }
1425 }
1426 else {
1427 if (data->start_class->flags & ANYOF_LOCALE)
1428 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1429 else {
1430 for (value = 0; value < 256; value++)
1431 if (isALNUM(value))
b81d288d 1432 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1433 }
1434 }
1435 break;
1436 case ALNUML:
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (data->start_class->flags & ANYOF_LOCALE)
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1440 }
1441 else {
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1443 data->start_class->flags |= ANYOF_LOCALE;
1444 }
1445 break;
1446 case NALNUM:
1447 if (flags & SCF_DO_STCLASS_AND) {
1448 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1449 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1450 for (value = 0; value < 256; value++)
1451 if (isALNUM(value))
1452 ANYOF_BITMAP_CLEAR(data->start_class, value);
1453 }
1454 }
1455 else {
1456 if (data->start_class->flags & ANYOF_LOCALE)
1457 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1458 else {
1459 for (value = 0; value < 256; value++)
1460 if (!isALNUM(value))
b81d288d 1461 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1462 }
1463 }
1464 break;
1465 case NALNUML:
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (data->start_class->flags & ANYOF_LOCALE)
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1469 }
1470 else {
1471 data->start_class->flags |= ANYOF_LOCALE;
1472 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1473 }
1474 break;
1475 case SPACE:
1476 if (flags & SCF_DO_STCLASS_AND) {
1477 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1478 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1479 for (value = 0; value < 256; value++)
1480 if (!isSPACE(value))
1481 ANYOF_BITMAP_CLEAR(data->start_class, value);
1482 }
1483 }
1484 else {
1485 if (data->start_class->flags & ANYOF_LOCALE)
1486 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1487 else {
1488 for (value = 0; value < 256; value++)
1489 if (isSPACE(value))
b81d288d 1490 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1491 }
1492 }
1493 break;
1494 case SPACEL:
1495 if (flags & SCF_DO_STCLASS_AND) {
1496 if (data->start_class->flags & ANYOF_LOCALE)
1497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1498 }
1499 else {
1500 data->start_class->flags |= ANYOF_LOCALE;
1501 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1502 }
1503 break;
1504 case NSPACE:
1505 if (flags & SCF_DO_STCLASS_AND) {
1506 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1507 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1508 for (value = 0; value < 256; value++)
1509 if (isSPACE(value))
1510 ANYOF_BITMAP_CLEAR(data->start_class, value);
1511 }
1512 }
1513 else {
1514 if (data->start_class->flags & ANYOF_LOCALE)
1515 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1516 else {
1517 for (value = 0; value < 256; value++)
1518 if (!isSPACE(value))
b81d288d 1519 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1520 }
1521 }
1522 break;
1523 case NSPACEL:
1524 if (flags & SCF_DO_STCLASS_AND) {
1525 if (data->start_class->flags & ANYOF_LOCALE) {
1526 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1527 for (value = 0; value < 256; value++)
1528 if (!isSPACE(value))
1529 ANYOF_BITMAP_CLEAR(data->start_class, value);
1530 }
1531 }
1532 else {
1533 data->start_class->flags |= ANYOF_LOCALE;
1534 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1535 }
1536 break;
1537 case DIGIT:
1538 if (flags & SCF_DO_STCLASS_AND) {
1539 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1540 for (value = 0; value < 256; value++)
1541 if (!isDIGIT(value))
1542 ANYOF_BITMAP_CLEAR(data->start_class, value);
1543 }
1544 else {
1545 if (data->start_class->flags & ANYOF_LOCALE)
1546 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1547 else {
1548 for (value = 0; value < 256; value++)
1549 if (isDIGIT(value))
b81d288d 1550 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1551 }
1552 }
1553 break;
1554 case NDIGIT:
1555 if (flags & SCF_DO_STCLASS_AND) {
1556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1557 for (value = 0; value < 256; value++)
1558 if (isDIGIT(value))
1559 ANYOF_BITMAP_CLEAR(data->start_class, value);
1560 }
1561 else {
1562 if (data->start_class->flags & ANYOF_LOCALE)
1563 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1564 else {
1565 for (value = 0; value < 256; value++)
1566 if (!isDIGIT(value))
b81d288d 1567 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1568 }
1569 }
1570 break;
1571 }
1572 if (flags & SCF_DO_STCLASS_OR)
1573 cl_and(data->start_class, &and_with);
1574 flags &= ~SCF_DO_STCLASS;
1575 }
a0ed51b3 1576 }
22c35a8c 1577 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1578 data->flags |= (OP(scan) == MEOL
1579 ? SF_BEFORE_MEOL
1580 : SF_BEFORE_SEOL);
a0ed51b3 1581 }
653099ff
GS
1582 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1583 /* Lookbehind, or need to calculate parens/evals/stclass: */
1584 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1585 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1586 /* Lookahead/lookbehind */
cb434fcc 1587 I32 deltanext, minnext, fake = 0;
c277df42 1588 regnode *nscan;
653099ff
GS
1589 struct regnode_charclass_class intrnl;
1590 int f = 0;
c277df42
IZ
1591
1592 data_fake.flags = 0;
b81d288d 1593 if (data) {
2c2d71f5 1594 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1595 data_fake.last_closep = data->last_closep;
1596 }
1597 else
1598 data_fake.last_closep = &fake;
653099ff
GS
1599 if ( flags & SCF_DO_STCLASS && !scan->flags
1600 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1601 cl_init(pRExC_state, &intrnl);
653099ff 1602 data_fake.start_class = &intrnl;
e1901655 1603 f |= SCF_DO_STCLASS_AND;
653099ff 1604 }
e1901655
IZ
1605 if (flags & SCF_WHILEM_VISITED_POS)
1606 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1607 next = regnext(scan);
1608 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1609 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1610 if (scan->flags) {
1611 if (deltanext) {
9baa0206 1612 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1613 }
1614 else if (minnext > U8_MAX) {
9baa0206 1615 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1616 }
eb160463 1617 scan->flags = (U8)minnext;
c277df42
IZ
1618 }
1619 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1620 pars++;
405ff068 1621 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1622 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1623 if (data)
1624 data->whilem_c = data_fake.whilem_c;
e1901655 1625 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1626 int was = (data->start_class->flags & ANYOF_EOS);
1627
1628 cl_and(data->start_class, &intrnl);
1629 if (was)
1630 data->start_class->flags |= ANYOF_EOS;
1631 }
a0ed51b3
LW
1632 }
1633 else if (OP(scan) == OPEN) {
c277df42 1634 pars++;
a0ed51b3 1635 }
cb434fcc 1636 else if (OP(scan) == CLOSE) {
eb160463 1637 if ((I32)ARG(scan) == is_par) {
cb434fcc 1638 next = regnext(scan);
c277df42 1639
cb434fcc
IZ
1640 if ( next && (OP(next) != WHILEM) && next < last)
1641 is_par = 0; /* Disable optimization */
1642 }
1643 if (data)
1644 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1645 }
1646 else if (OP(scan) == EVAL) {
c277df42
IZ
1647 if (data)
1648 data->flags |= SF_HAS_EVAL;
1649 }
96776eda 1650 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1651 if (flags & SCF_DO_SUBSTR) {
830247a4 1652 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1653 data->longest = &(data->longest_float);
1654 }
1655 is_inf = is_inf_internal = 1;
653099ff 1656 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1657 cl_anything(pRExC_state, data->start_class);
96776eda 1658 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1659 }
c277df42
IZ
1660 /* Else: zero-length, ignore. */
1661 scan = regnext(scan);
1662 }
1663
1664 finish:
1665 *scanp = scan;
aca2d497 1666 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1667 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1668 data->pos_delta = I32_MAX - data->pos_min;
1669 if (is_par > U8_MAX)
1670 is_par = 0;
1671 if (is_par && pars==1 && data) {
1672 data->flags |= SF_IN_PAR;
1673 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1674 }
1675 else if (pars && data) {
c277df42
IZ
1676 data->flags |= SF_HAS_PAR;
1677 data->flags &= ~SF_IN_PAR;
1678 }
653099ff
GS
1679 if (flags & SCF_DO_STCLASS_OR)
1680 cl_and(data->start_class, &and_with);
c277df42
IZ
1681 return min;
1682}
1683
76e3520e 1684STATIC I32
830247a4 1685S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1686{
830247a4 1687 if (RExC_rx->data) {
b81d288d
AB
1688 Renewc(RExC_rx->data,
1689 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1690 char, struct reg_data);
830247a4
IZ
1691 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1692 RExC_rx->data->count += n;
a0ed51b3
LW
1693 }
1694 else {
830247a4 1695 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1696 char, struct reg_data);
830247a4
IZ
1697 New(1208, RExC_rx->data->what, n, U8);
1698 RExC_rx->data->count = n;
c277df42 1699 }
830247a4
IZ
1700 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1701 return RExC_rx->data->count - n;
c277df42
IZ
1702}
1703
d88dccdf 1704void
864dbfa3 1705Perl_reginitcolors(pTHX)
d88dccdf 1706{
d88dccdf
IZ
1707 int i = 0;
1708 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1709
d88dccdf
IZ
1710 if (s) {
1711 PL_colors[0] = s = savepv(s);
1712 while (++i < 6) {
1713 s = strchr(s, '\t');
1714 if (s) {
1715 *s = '\0';
1716 PL_colors[i] = ++s;
1717 }
1718 else
c712d376 1719 PL_colors[i] = s = "";
d88dccdf
IZ
1720 }
1721 } else {
b81d288d 1722 while (i < 6)
d88dccdf
IZ
1723 PL_colors[i++] = "";
1724 }
1725 PL_colorset = 1;
1726}
1727
8615cb43 1728
a687059c 1729/*
e50aee73 1730 - pregcomp - compile a regular expression into internal code
a687059c
LW
1731 *
1732 * We can't allocate space until we know how big the compiled form will be,
1733 * but we can't compile it (and thus know how big it is) until we've got a
1734 * place to put the code. So we cheat: we compile it twice, once with code
1735 * generation turned off and size counting turned on, and once "for real".
1736 * This also means that we don't allocate space until we are sure that the
1737 * thing really will compile successfully, and we never have to move the
1738 * code and thus invalidate pointers into it. (Note that it has to be in
1739 * one piece because free() must be able to free it all.) [NB: not true in perl]
1740 *
1741 * Beware that the optimization-preparation code in here knows about some
1742 * of the structure of the compiled regexp. [I'll say.]
1743 */
1744regexp *
864dbfa3 1745Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1746{
a0d0e21e 1747 register regexp *r;
c277df42 1748 regnode *scan;
c277df42 1749 regnode *first;
a0d0e21e 1750 I32 flags;
a0d0e21e
LW
1751 I32 minlen = 0;
1752 I32 sawplus = 0;
1753 I32 sawopen = 0;
2c2d71f5 1754 scan_data_t data;
830247a4
IZ
1755 RExC_state_t RExC_state;
1756 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1757
1758 if (exp == NULL)
c277df42 1759 FAIL("NULL regexp argument");
a0d0e21e 1760
a5961de5 1761 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1762
5cfc7842 1763 RExC_precomp = exp;
a5961de5
JH
1764 DEBUG_r({
1765 if (!PL_colorset) reginitcolors();
1766 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1767 PL_colors[4],PL_colors[5],PL_colors[0],
1768 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1769 });
e2509266 1770 RExC_flags = pm->op_pmflags;
830247a4 1771 RExC_sawback = 0;
bbce6d69 1772
830247a4
IZ
1773 RExC_seen = 0;
1774 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1775 RExC_seen_evals = 0;
1776 RExC_extralen = 0;
c277df42 1777
bbce6d69 1778 /* First pass: determine size, legality. */
830247a4 1779 RExC_parse = exp;
fac92740 1780 RExC_start = exp;
830247a4
IZ
1781 RExC_end = xend;
1782 RExC_naughty = 0;
1783 RExC_npar = 1;
1784 RExC_size = 0L;
1785 RExC_emit = &PL_regdummy;
1786 RExC_whilem_seen = 0;
85ddcde9
JH
1787#if 0 /* REGC() is (currently) a NOP at the first pass.
1788 * Clever compilers notice this and complain. --jhi */
830247a4 1789 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1790#endif
830247a4 1791 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1792 RExC_precomp = Nullch;
a0d0e21e
LW
1793 return(NULL);
1794 }
830247a4 1795 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1796
c277df42
IZ
1797 /* Small enough for pointer-storage convention?
1798 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1799 if (RExC_size >= 0x10000L && RExC_extralen)
1800 RExC_size += RExC_extralen;
c277df42 1801 else
830247a4
IZ
1802 RExC_extralen = 0;
1803 if (RExC_whilem_seen > 15)
1804 RExC_whilem_seen = 15;
a0d0e21e 1805
bbce6d69 1806 /* Allocate space and initialize. */
830247a4 1807 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1808 char, regexp);
a0d0e21e 1809 if (r == NULL)
b45f050a
JF
1810 FAIL("Regexp out of space");
1811
0f79a09d
GS
1812#ifdef DEBUGGING
1813 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1814 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1815#endif
c277df42 1816 r->refcnt = 1;
bbce6d69 1817 r->prelen = xend - exp;
5cfc7842 1818 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 1819 r->subbeg = NULL;
ed252734
NC
1820#ifdef PERL_COPY_ON_WRITE
1821 r->saved_copy = Nullsv;
1822#endif
cf93c79d 1823 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1824 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1825
1826 r->substrs = 0; /* Useful during FAIL. */
1827 r->startp = 0; /* Useful during FAIL. */
1828 r->endp = 0; /* Useful during FAIL. */
1829
fac92740
MJD
1830 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1831 if (r->offsets) {
1832 r->offsets[0] = RExC_size;
1833 }
1834 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1835 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1836 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1837 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1838
830247a4 1839 RExC_rx = r;
bbce6d69
PP
1840
1841 /* Second pass: emit code. */
e2509266 1842 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1843 RExC_parse = exp;
1844 RExC_end = xend;
1845 RExC_naughty = 0;
1846 RExC_npar = 1;
fac92740 1847 RExC_emit_start = r->program;
830247a4 1848 RExC_emit = r->program;
2cd61cdb 1849 /* Store the count of eval-groups for security checks: */
eb160463 1850 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1851 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1852 r->data = 0;
830247a4 1853 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1854 return(NULL);
1855
1856 /* Dig out information for optimizations. */
cf93c79d 1857 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1858 pm->op_pmflags = RExC_flags;
a0ed51b3 1859 if (UTF)
5ff6fc6d 1860 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1861 r->regstclass = NULL;
830247a4 1862 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1863 r->reganch |= ROPT_NAUGHTY;
c277df42 1864 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1865
1866 /* XXXX To minimize changes to RE engine we always allocate
1867 3-units-long substrs field. */
1868 Newz(1004, r->substrs, 1, struct reg_substr_data);
1869
2c2d71f5 1870 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1871 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1872 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1873 I32 fake;
c5254dd6 1874 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1875 struct regnode_charclass_class ch_class;
1876 int stclass_flag;
cb434fcc 1877 I32 last_close = 0;
a0d0e21e
LW
1878
1879 first = scan;
c277df42 1880 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1881 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1882 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1883 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1884 (OP(first) == PLUS) ||
1885 (OP(first) == MINMOD) ||
653099ff 1886 /* An {n,m} with n>0 */
22c35a8c 1887 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1888 if (OP(first) == PLUS)
1889 sawplus = 1;
1890 else
1891 first += regarglen[(U8)OP(first)];
1892 first = NEXTOPER(first);
a687059c
LW
1893 }
1894
a0d0e21e
LW
1895 /* Starting-point info. */
1896 again:
653099ff 1897 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1898 if (OP(first) == EXACT)
1899 ; /* Empty, get anchored substr later. */
1900 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1901 r->regstclass = first;
1902 }
653099ff 1903 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1904 r->regstclass = first;
22c35a8c
GS
1905 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1906 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1907 r->regstclass = first;
22c35a8c 1908 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1909 r->reganch |= (OP(first) == MBOL
1910 ? ROPT_ANCH_MBOL
1911 : (OP(first) == SBOL
1912 ? ROPT_ANCH_SBOL
1913 : ROPT_ANCH_BOL));
a0d0e21e 1914 first = NEXTOPER(first);
774d564b
PP
1915 goto again;
1916 }
1917 else if (OP(first) == GPOS) {
1918 r->reganch |= ROPT_ANCH_GPOS;
1919 first = NEXTOPER(first);
1920 goto again;
a0d0e21e 1921 }
e09294f4 1922 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1923 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1924 !(r->reganch & ROPT_ANCH) )
1925 {
1926 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1927 int type = OP(NEXTOPER(first));
1928
ffc61ed2 1929 if (type == REG_ANY)
cad2e5aa
JH
1930 type = ROPT_ANCH_MBOL;
1931 else
1932 type = ROPT_ANCH_SBOL;
1933
1934 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1935 first = NEXTOPER(first);
774d564b 1936 goto again;
a0d0e21e 1937 }
b81d288d 1938 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1939 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1940 /* x+ must match at the 1st pos of run of x's */
1941 r->reganch |= ROPT_SKIP;
a0d0e21e 1942
c277df42 1943 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1944 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1945 (IV)(first - scan + 1)));
a0d0e21e
LW
1946 /*
1947 * If there's something expensive in the r.e., find the
1948 * longest literal string that must appear and make it the
1949 * regmust. Resolve ties in favor of later strings, since
1950 * the regstart check works with the beginning of the r.e.
1951 * and avoiding duplication strengthens checking. Not a
1952 * strong reason, but sufficient in the absence of others.
1953 * [Now we resolve ties in favor of the earlier string if
c277df42 1954 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1955 * earlier string may buy us something the later one won't.]
1956 */
a0d0e21e 1957 minlen = 0;
a687059c 1958
79cb57f6
GS
1959 data.longest_fixed = newSVpvn("",0);
1960 data.longest_float = newSVpvn("",0);
1961 data.last_found = newSVpvn("",0);
c277df42
IZ
1962 data.longest = &(data.longest_fixed);
1963 first = scan;
653099ff 1964 if (!r->regstclass) {
830247a4 1965 cl_init(pRExC_state, &ch_class);
653099ff
GS
1966 data.start_class = &ch_class;
1967 stclass_flag = SCF_DO_STCLASS_AND;
1968 } else /* XXXX Check for BOUND? */
1969 stclass_flag = 0;
cb434fcc 1970 data.last_closep = &last_close;
653099ff 1971
830247a4 1972 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1973 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1974 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1975 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1976 && !RExC_seen_zerolen
1977 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1978 r->reganch |= ROPT_CHECK_ALL;
830247a4 1979 scan_commit(pRExC_state, &data);
c277df42
IZ
1980 SvREFCNT_dec(data.last_found);
1981
a0ed51b3 1982 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1983 if (longest_float_length
c277df42
IZ
1984 || (data.flags & SF_FL_BEFORE_EOL
1985 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1986 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1987 int t;
1988
a0ed51b3 1989 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1990 && data.offset_fixed == data.offset_float_min
1991 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1992 goto remove_float; /* As in (a)+. */
1993
33b8afdf
JH
1994 if (SvUTF8(data.longest_float)) {
1995 r->float_utf8 = data.longest_float;
1996 r->float_substr = Nullsv;
1997 } else {
1998 r->float_substr = data.longest_float;
1999 r->float_utf8 = Nullsv;
2000 }
c277df42
IZ
2001 r->float_min_offset = data.offset_float_min;
2002 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
2003 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
2004 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 2005 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2006 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2007 }
2008 else {
aca2d497 2009 remove_float:
33b8afdf 2010 r->float_substr = r->float_utf8 = Nullsv;
c277df42 2011 SvREFCNT_dec(data.longest_float);
c5254dd6 2012 longest_float_length = 0;
a0d0e21e 2013 }
c277df42 2014
a0ed51b3 2015 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 2016 if (longest_fixed_length
c277df42
IZ
2017 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2018 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2019 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
2020 int t;
2021
33b8afdf
JH
2022 if (SvUTF8(data.longest_fixed)) {
2023 r->anchored_utf8 = data.longest_fixed;
2024 r->anchored_substr = Nullsv;
2025 } else {
2026 r->anchored_substr = data.longest_fixed;
2027 r->anchored_utf8 = Nullsv;
2028 }
c277df42 2029 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2030 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2031 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2032 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2033 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2034 }
2035 else {
33b8afdf 2036 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2037 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2038 longest_fixed_length = 0;
a0d0e21e 2039 }
b81d288d 2040 if (r->regstclass
ffc61ed2 2041 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2042 r->regstclass = NULL;
33b8afdf
JH
2043 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2044 && stclass_flag
653099ff 2045 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2046 && !cl_is_anything(data.start_class))
2047 {
830247a4 2048 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2049
b81d288d 2050 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2051 struct regnode_charclass_class);
2052 StructCopy(data.start_class,
830247a4 2053 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2054 struct regnode_charclass_class);
830247a4 2055 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2056 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2057 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2058 DEBUG_r({ SV *sv = sv_newmortal();
2059 regprop(sv, (regnode*)data.start_class);
2060 PerlIO_printf(Perl_debug_log,
2061 "synthetic stclass `%s'.\n",
2062 SvPVX(sv));});
653099ff 2063 }
c277df42
IZ
2064
2065 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2066 if (longest_fixed_length > longest_float_length) {
c277df42 2067 r->check_substr = r->anchored_substr;
33b8afdf 2068 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2069 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2070 if (r->reganch & ROPT_ANCH_SINGLE)
2071 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2072 }
2073 else {
c277df42 2074 r->check_substr = r->float_substr;
33b8afdf 2075 r->check_utf8 = r->float_utf8;
c277df42
IZ
2076 r->check_offset_min = data.offset_float_min;
2077 r->check_offset_max = data.offset_float_max;
a0d0e21e 2078 }
30382c73
IZ
2079 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2080 This should be changed ASAP! */
33b8afdf 2081 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2082 r->reganch |= RE_USE_INTUIT;
33b8afdf 2083 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2084 r->reganch |= RE_INTUIT_TAIL;
2085 }
a0ed51b3
LW
2086 }
2087 else {
c277df42
IZ
2088 /* Several toplevels. Best we can is to set minlen. */
2089 I32 fake;
653099ff 2090 struct regnode_charclass_class ch_class;
cb434fcc 2091 I32 last_close = 0;
c277df42
IZ
2092
2093 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2094 scan = r->program + 1;
830247a4 2095 cl_init(pRExC_state, &ch_class);
653099ff 2096 data.start_class = &ch_class;
cb434fcc 2097 data.last_closep = &last_close;
e1901655 2098 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2099 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2100 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2101 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2102 && !cl_is_anything(data.start_class))
2103 {
830247a4 2104 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2105
b81d288d 2106 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2107 struct regnode_charclass_class);
2108 StructCopy(data.start_class,
830247a4 2109 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2110 struct regnode_charclass_class);
830247a4 2111 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2112 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2113 DEBUG_r({ SV* sv = sv_newmortal();
2114 regprop(sv, (regnode*)data.start_class);
2115 PerlIO_printf(Perl_debug_log,
2116 "synthetic stclass `%s'.\n",
2117 SvPVX(sv));});
653099ff 2118 }
a0d0e21e
LW
2119 }
2120
a0d0e21e 2121 r->minlen = minlen;
b81d288d 2122 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2123 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2124 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2125 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2126 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2127 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2128 if (RExC_seen & REG_SEEN_CANY)
2129 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2130 Newz(1002, r->startp, RExC_npar, I32);
2131 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2132 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2133 DEBUG_r(regdump(r));
2134 return(r);
a687059c
LW
2135}
2136
2137/*
2138 - reg - regular expression, i.e. main body or parenthesized thing
2139 *
2140 * Caller must absorb opening parenthesis.
2141 *
2142 * Combining parenthesis handling with the base level of regular expression
2143 * is a trifle forced, but the need to tie the tails of the branches to what
2144 * follows makes it hard to avoid.
2145 */
76e3520e 2146STATIC regnode *
830247a4 2147S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2148 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2149{
c277df42
IZ
2150 register regnode *ret; /* Will be the head of the group. */
2151 register regnode *br;
2152 register regnode *lastbr;
2153 register regnode *ender = 0;
a0d0e21e 2154 register I32 parno = 0;
e2509266 2155 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2156
2157 /* for (?g), (?gc), and (?o) warnings; warning
2158 about (?c) will warn about (?g) -- japhy */
2159
2160 I32 wastedflags = 0x00,
2161 wasted_o = 0x01,
2162 wasted_g = 0x02,
2163 wasted_gc = 0x02 | 0x04,
2164 wasted_c = 0x04;
2165
fac92740 2166 char * parse_start = RExC_parse; /* MJD */
830247a4 2167 char *oregcomp_parse = RExC_parse;
c277df42 2168 char c;
a0d0e21e 2169
821b33a5 2170 *flagp = 0; /* Tentatively. */
a0d0e21e 2171
9d1d55b5 2172
a0d0e21e
LW
2173 /* Make an OPEN node, if parenthesized. */
2174 if (paren) {
fac92740 2175 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2176 U32 posflags = 0, negflags = 0;
2177 U32 *flagsp = &posflags;
0f5d15d6 2178 int logical = 0;
830247a4 2179 char *seqstart = RExC_parse;
ca9dfc88 2180
830247a4
IZ
2181 RExC_parse++;
2182 paren = *RExC_parse++;
c277df42 2183 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2184 switch (paren) {
fac92740 2185 case '<': /* (?<...) */
830247a4 2186 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2187 if (*RExC_parse == '!')
c277df42 2188 paren = ',';
b81d288d 2189 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2190 goto unknown;
830247a4 2191 RExC_parse++;
fac92740
MJD
2192 case '=': /* (?=...) */
2193 case '!': /* (?!...) */
830247a4 2194 RExC_seen_zerolen++;
fac92740
MJD
2195 case ':': /* (?:...) */
2196 case '>': /* (?>...) */
a0d0e21e 2197 break;
fac92740
MJD
2198 case '$': /* (?$...) */
2199 case '@': /* (?@...) */
8615cb43 2200 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2201 break;
fac92740 2202 case '#': /* (?#...) */
830247a4
IZ
2203 while (*RExC_parse && *RExC_parse != ')')
2204 RExC_parse++;
2205 if (*RExC_parse != ')')
c277df42 2206 FAIL("Sequence (?#... not terminated");
830247a4 2207 nextchar(pRExC_state);
a0d0e21e
LW
2208 *flagp = TRYAGAIN;
2209 return NULL;
fac92740 2210 case 'p': /* (?p...) */
9014280d 2211 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2212 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2213 /* FALL THROUGH*/
fac92740 2214 case '?': /* (??...) */
0f5d15d6 2215 logical = 1;
438a3801
YST
2216 if (*RExC_parse != '{')
2217 goto unknown;
830247a4 2218 paren = *RExC_parse++;
0f5d15d6 2219 /* FALL THROUGH */
fac92740 2220 case '{': /* (?{...}) */
c277df42 2221 {
c277df42
IZ
2222 I32 count = 1, n = 0;
2223 char c;
830247a4 2224 char *s = RExC_parse;
c277df42
IZ
2225 SV *sv;
2226 OP_4tree *sop, *rop;
2227
830247a4
IZ
2228 RExC_seen_zerolen++;
2229 RExC_seen |= REG_SEEN_EVAL;
2230 while (count && (c = *RExC_parse)) {
2231 if (c == '\\' && RExC_parse[1])
2232 RExC_parse++;
b81d288d 2233 else if (c == '{')
c277df42 2234 count++;
b81d288d 2235 else if (c == '}')
c277df42 2236 count--;
830247a4 2237 RExC_parse++;
c277df42 2238 }
830247a4 2239 if (*RExC_parse != ')')
b45f050a 2240 {
b81d288d 2241 RExC_parse = s;
b45f050a
JF
2242 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2243 }
c277df42 2244 if (!SIZE_ONLY) {
f3548bdc 2245 PAD *pad;
b81d288d
AB
2246
2247 if (RExC_parse - 1 - s)
830247a4 2248 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2249 else
79cb57f6 2250 sv = newSVpvn("", 0);
c277df42 2251
569233ed
SB
2252 ENTER;
2253 Perl_save_re_context(aTHX);
f3548bdc 2254 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2255 sop->op_private |= OPpREFCOUNTED;
2256 /* re_dup will OpREFCNT_inc */
2257 OpREFCNT_set(sop, 1);
569233ed 2258 LEAVE;
c277df42 2259
830247a4
IZ
2260 n = add_data(pRExC_state, 3, "nop");
2261 RExC_rx->data->data[n] = (void*)rop;
2262 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 2263 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2264 SvREFCNT_dec(sv);
a0ed51b3 2265 }
e24b16f9 2266 else { /* First pass */
830247a4 2267 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 2268 && IN_PERL_RUNTIME)
2cd61cdb
IZ
2269 /* No compiled RE interpolated, has runtime
2270 components ===> unsafe. */
2271 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2272 if (PL_tainting && PL_tainted)
cc6b7395 2273 FAIL("Eval-group in insecure regular expression");
923e4eb5 2274 if (IN_PERL_COMPILETIME)
b5c19bd7 2275 PL_cv_has_eval = 1;
c277df42 2276 }
b5c19bd7 2277
830247a4 2278 nextchar(pRExC_state);
0f5d15d6 2279 if (logical) {
830247a4 2280 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2281 if (!SIZE_ONLY)
2282 ret->flags = 2;
830247a4 2283 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2284 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2285 return ret;
2286 }
ccb2c380
MP
2287 ret = reganode(pRExC_state, EVAL, n);
2288 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2289 Set_Node_Offset(ret, parse_start);
2290 return ret;
c277df42 2291 }
fac92740 2292 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2293 {
fac92740 2294 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2295 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2296 || RExC_parse[1] == '<'
830247a4 2297 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2298 I32 flag;
2299
830247a4 2300 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2301 if (!SIZE_ONLY)
2302 ret->flags = 1;
830247a4 2303 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2304 goto insert_if;
b81d288d 2305 }
a0ed51b3 2306 }
830247a4 2307 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2308 /* (?(1)...) */
830247a4 2309 parno = atoi(RExC_parse++);
c277df42 2310
830247a4
IZ
2311 while (isDIGIT(*RExC_parse))
2312 RExC_parse++;
fac92740
MJD
2313 ret = reganode(pRExC_state, GROUPP, parno);
2314
830247a4 2315 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2316 vFAIL("Switch condition not recognized");
c277df42 2317 insert_if:
830247a4
IZ
2318 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2319 br = regbranch(pRExC_state, &flags, 1);
c277df42 2320 if (br == NULL)
830247a4 2321 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2322 else
830247a4
IZ
2323 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2324 c = *nextchar(pRExC_state);
d1b80229
IZ
2325 if (flags&HASWIDTH)
2326 *flagp |= HASWIDTH;
c277df42 2327 if (c == '|') {
830247a4
IZ
2328 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2329 regbranch(pRExC_state, &flags, 1);
2330 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2331 if (flags&HASWIDTH)
2332 *flagp |= HASWIDTH;
830247a4 2333 c = *nextchar(pRExC_state);
a0ed51b3
LW
2334 }
2335 else
c277df42
IZ
2336 lastbr = NULL;
2337 if (c != ')')
8615cb43 2338 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2339 ender = reg_node(pRExC_state, TAIL);
2340 regtail(pRExC_state, br, ender);
c277df42 2341 if (lastbr) {
830247a4
IZ
2342 regtail(pRExC_state, lastbr, ender);
2343 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2344 }
2345 else
830247a4 2346 regtail(pRExC_state, ret, ender);
c277df42 2347 return ret;
a0ed51b3
LW
2348 }
2349 else {
830247a4 2350 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2351 }
2352 }
1b1626e4 2353 case 0:
830247a4 2354 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2355 vFAIL("Sequence (? incomplete");
1b1626e4 2356 break;
a0d0e21e 2357 default:
830247a4 2358 --RExC_parse;
fac92740 2359 parse_flags: /* (?i) */
830247a4 2360 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2361 /* (?g), (?gc) and (?o) are useless here
2362 and must be globally applied -- japhy */
2363
2364 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2365 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2366 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2367 if (! (wastedflags & wflagbit) ) {
2368 wastedflags |= wflagbit;
2369 vWARN5(
2370 RExC_parse + 1,
2371 "Useless (%s%c) - %suse /%c modifier",
2372 flagsp == &negflags ? "?-" : "?",
2373 *RExC_parse,
2374 flagsp == &negflags ? "don't " : "",
2375 *RExC_parse
2376 );
2377 }
2378 }
2379 }
2380 else if (*RExC_parse == 'c') {
2381 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2382 if (! (wastedflags & wasted_c) ) {
2383 wastedflags |= wasted_gc;
2384 vWARN3(
2385 RExC_parse + 1,
2386 "Useless (%sc) - %suse /gc modifier",
2387 flagsp == &negflags ? "?-" : "?",
2388 flagsp == &negflags ? "don't " : ""
2389 );
2390 }
2391 }
2392 }
2393 else { pmflag(flagsp, *RExC_parse); }
2394
830247a4 2395 ++RExC_parse;
ca9dfc88 2396 }
830247a4 2397 if (*RExC_parse == '-') {
ca9dfc88 2398 flagsp = &negflags;
9d1d55b5 2399 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2400 ++RExC_parse;
ca9dfc88 2401 goto parse_flags;
48c036b1 2402 }
e2509266
JH
2403 RExC_flags |= posflags;
2404 RExC_flags &= ~negflags;
830247a4
IZ
2405 if (*RExC_parse == ':') {
2406 RExC_parse++;
ca9dfc88
IZ
2407 paren = ':';
2408 break;
2409 }
c277df42 2410 unknown:
830247a4
IZ
2411 if (*RExC_parse != ')') {
2412 RExC_parse++;
2413 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2414 }
830247a4 2415 nextchar(pRExC_state);
a0d0e21e
LW
2416 *flagp = TRYAGAIN;
2417 return NULL;
2418 }
2419 }
fac92740 2420 else { /* (...) */
830247a4
IZ
2421 parno = RExC_npar;
2422 RExC_npar++;
2423 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2424 Set_Node_Length(ret, 1); /* MJD */
2425 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2426 open = 1;
a0d0e21e 2427 }
a0ed51b3 2428 }
fac92740 2429 else /* ! paren */
a0d0e21e
LW
2430 ret = NULL;
2431
2432 /* Pick up the branches, linking them together. */
fac92740 2433 parse_start = RExC_parse; /* MJD */
830247a4 2434 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2435 /* branch_len = (paren != 0); */
2436
a0d0e21e
LW
2437 if (br == NULL)
2438 return(NULL);
830247a4
IZ
2439 if (*RExC_parse == '|') {
2440 if (!SIZE_ONLY && RExC_extralen) {
2441 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2442 }
fac92740 2443 else { /* MJD */
830247a4 2444 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2445 Set_Node_Length(br, paren != 0);
2446 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2447 }
c277df42
IZ
2448 have_branch = 1;
2449 if (SIZE_ONLY)
830247a4 2450 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2451 }
2452 else if (paren == ':') {
c277df42
IZ
2453 *flagp |= flags&SIMPLE;
2454 }
2455 if (open) { /* Starts with OPEN. */
830247a4 2456 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2457 }
2458 else if (paren != '?') /* Not Conditional */
a0d0e21e 2459 ret = br;
32a0ca98 2460 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2461 lastbr = br;
830247a4
IZ
2462 while (*RExC_parse == '|') {
2463 if (!SIZE_ONLY && RExC_extralen) {
2464 ender = reganode(pRExC_state, LONGJMP,0);
2465 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2466 }
2467 if (SIZE_ONLY)
830247a4
IZ
2468 RExC_extralen += 2; /* Account for LONGJMP. */
2469 nextchar(pRExC_state);
2470 br = regbranch(pRExC_state, &flags, 0);
fac92740 2471
a687059c 2472 if (br == NULL)
a0d0e21e 2473 return(NULL);
830247a4 2474 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2475 lastbr = br;
821b33a5
IZ
2476 if (flags&HASWIDTH)
2477 *flagp |= HASWIDTH;
a687059c 2478 *flagp |= flags&SPSTART;
a0d0e21e
LW
2479 }
2480
c277df42
IZ
2481 if (have_branch || paren != ':') {
2482 /* Make a closing node, and hook it on the end. */
2483 switch (paren) {
2484 case ':':
830247a4 2485 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2486 break;
2487 case 1:
830247a4 2488 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2489 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2490 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2491 break;
2492 case '<':
c277df42
IZ
2493 case ',':
2494 case '=':
2495 case '!':
c277df42 2496 *flagp &= ~HASWIDTH;
821b33a5
IZ
2497 /* FALL THROUGH */
2498 case '>':
830247a4 2499 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2500 break;
2501 case 0:
830247a4 2502 ender = reg_node(pRExC_state, END);
c277df42
IZ
2503 break;
2504 }
830247a4 2505 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2506
c277df42
IZ
2507 if (have_branch) {
2508 /* Hook the tails of the branches to the closing node. */
2509 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2510 regoptail(pRExC_state, br, ender);
c277df42
IZ
2511 }
2512 }
a0d0e21e 2513 }
c277df42
IZ
2514
2515 {
2516 char *p;
2517 static char parens[] = "=!<,>";
2518
2519 if (paren && (p = strchr(parens, paren))) {
eb160463 2520 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2521 int flag = (p - parens) > 1;
2522
2523 if (paren == '>')
2524 node = SUSPEND, flag = 0;
830247a4 2525 reginsert(pRExC_state, node,ret);
45948336
EP
2526 Set_Node_Cur_Length(ret);
2527 Set_Node_Offset(ret, parse_start + 1);
c277df42 2528 ret->flags = flag;
830247a4 2529 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2530 }
a0d0e21e
LW
2531 }
2532
2533 /* Check for proper termination. */
ce3e6498 2534 if (paren) {
e2509266 2535 RExC_flags = oregflags;
830247a4
IZ
2536 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2537 RExC_parse = oregcomp_parse;
380a0633 2538 vFAIL("Unmatched (");
ce3e6498 2539 }
a0ed51b3 2540 }
830247a4
IZ
2541 else if (!paren && RExC_parse < RExC_end) {
2542 if (*RExC_parse == ')') {
2543 RExC_parse++;
380a0633 2544 vFAIL("Unmatched )");
a0ed51b3
LW
2545 }
2546 else
b45f050a 2547 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2548 /* NOTREACHED */
2549 }
a687059c 2550
a0d0e21e 2551 return(ret);
a687059c
LW
2552}
2553
2554/*
2555 - regbranch - one alternative of an | operator
2556 *
2557 * Implements the concatenation operator.
2558 */
76e3520e 2559STATIC regnode *
830247a4 2560S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2561{
c277df42
IZ
2562 register regnode *ret;
2563 register regnode *chain = NULL;
2564 register regnode *latest;
2565 I32 flags = 0, c = 0;
a0d0e21e 2566
b81d288d 2567 if (first)
c277df42
IZ
2568 ret = NULL;
2569 else {
b81d288d 2570 if (!SIZE_ONLY && RExC_extralen)
830247a4 2571 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2572 else {
830247a4 2573 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2574 Set_Node_Length(ret, 1);
2575 }
c277df42
IZ
2576 }
2577
b81d288d 2578 if (!first && SIZE_ONLY)
830247a4 2579 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2580
c277df42 2581 *flagp = WORST; /* Tentatively. */
a0d0e21e 2582
830247a4
IZ
2583 RExC_parse--;
2584 nextchar(pRExC_state);
2585 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2586 flags &= ~TRYAGAIN;
830247a4 2587 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2588 if (latest == NULL) {
2589 if (flags & TRYAGAIN)
2590 continue;
2591 return(NULL);
a0ed51b3
LW
2592 }
2593 else if (ret == NULL)
c277df42 2594 ret = latest;
a0d0e21e 2595 *flagp |= flags&HASWIDTH;
c277df42 2596 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2597 *flagp |= flags&SPSTART;
2598 else {
830247a4
IZ
2599 RExC_naughty++;
2600 regtail(pRExC_state, chain, latest);
a687059c 2601 }
a0d0e21e 2602 chain = latest;
c277df42
IZ
2603 c++;
2604 }
2605 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2606 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2607 if (ret == NULL)
2608 ret = chain;
2609 }
2610 if (c == 1) {
2611 *flagp |= flags&SIMPLE;
a0d0e21e 2612 }
a687059c 2613
a0d0e21e 2614 return(ret);
a687059c
LW
2615}
2616
2617/*
2618 - regpiece - something followed by possible [*+?]
2619 *
2620 * Note that the branching code sequences used for ? and the general cases
2621 * of * and + are somewhat optimized: they use the same NOTHING node as
2622 * both the endmarker for their branch list and the body of the last branch.
2623 * It might seem that this node could be dispensed with entirely, but the
2624 * endmarker role is not redundant.
2625 */
76e3520e 2626STATIC regnode *
830247a4 2627S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2628{
c277df42 2629 register regnode *ret;
a0d0e21e
LW
2630 register char op;
2631 register char *next;
2632 I32 flags;
830247a4 2633 char *origparse = RExC_parse;
a0d0e21e
LW
2634 char *maxpos;
2635 I32 min;
c277df42 2636 I32 max = REG_INFTY;
fac92740 2637 char *parse_start;
a0d0e21e 2638
830247a4 2639 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2640 if (ret == NULL) {
2641 if (flags & TRYAGAIN)
2642 *flagp |= TRYAGAIN;
2643 return(NULL);
2644 }
2645
830247a4 2646 op = *RExC_parse;
a0d0e21e 2647
830247a4 2648 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2649 parse_start = RExC_parse; /* MJD */
830247a4 2650 next = RExC_parse + 1;
a0d0e21e
LW
2651 maxpos = Nullch;
2652 while (isDIGIT(*next) || *next == ',') {
2653 if (*next == ',') {
2654 if (maxpos)
2655 break;
2656 else
2657 maxpos = next;
a687059c 2658 }
a0d0e21e
LW
2659 next++;
2660 }
2661 if (*next == '}') { /* got one */
2662 if (!maxpos)
2663 maxpos = next;
830247a4
IZ
2664 RExC_parse++;
2665 min = atoi(RExC_parse);
a0d0e21e
LW
2666 if (*maxpos == ',')
2667 maxpos++;
2668 else
830247a4 2669 maxpos = RExC_parse;
a0d0e21e
LW
2670 max = atoi(maxpos);
2671 if (!max && *maxpos != '0')
c277df42
IZ
2672 max = REG_INFTY; /* meaning "infinity" */
2673 else if (max >= REG_INFTY)
8615cb43 2674 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2675 RExC_parse = next;
2676 nextchar(pRExC_state);
a0d0e21e
LW
2677
2678 do_curly:
2679 if ((flags&SIMPLE)) {
830247a4
IZ
2680 RExC_naughty += 2 + RExC_naughty / 2;
2681 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2682 Set_Node_Offset(ret, parse_start+1); /* MJD */
2683 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2684 }
2685 else {
830247a4 2686 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2687
2688 w->flags = 0;
830247a4
IZ
2689 regtail(pRExC_state, ret, w);
2690 if (!SIZE_ONLY && RExC_extralen) {
2691 reginsert(pRExC_state, LONGJMP,ret);
2692 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2693 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2694 }
830247a4 2695 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2696 /* MJD hk */
2697 Set_Node_Offset(ret, parse_start+1);
2698 Set_Node_Length(ret,
2699 op == '{' ? (RExC_parse - parse_start) : 1);
2700
830247a4 2701 if (!SIZE_ONLY && RExC_extralen)
c277df42 2702 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2703 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2704 if (SIZE_ONLY)
830247a4
IZ
2705 RExC_whilem_seen++, RExC_extralen += 3;
2706 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2707 }
c277df42 2708 ret->flags = 0;
a0d0e21e
LW
2709
2710 if (min > 0)
821b33a5
IZ
2711 *flagp = WORST;
2712 if (max > 0)
2713 *flagp |= HASWIDTH;
a0d0e21e 2714 if (max && max < min)
8615cb43 2715 vFAIL("Can't do {n,m} with n > m");
c277df42 2716 if (!SIZE_ONLY) {
eb160463
GS
2717 ARG1_SET(ret, (U16)min);
2718 ARG2_SET(ret, (U16)max);
a687059c 2719 }
a687059c 2720
a0d0e21e 2721 goto nest_check;
a687059c 2722 }
a0d0e21e 2723 }
a687059c 2724
a0d0e21e
LW
2725 if (!ISMULT1(op)) {
2726 *flagp = flags;
a687059c 2727 return(ret);
a0d0e21e 2728 }
bb20fd44 2729
c277df42 2730#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2731
2732 /* if this is reinstated, don't forget to put this back into perldiag:
2733
2734 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2735
2736 (F) The part of the regexp subject to either the * or + quantifier
2737 could match an empty string. The {#} shows in the regular
2738 expression about where the problem was discovered.
2739
2740 */
2741
bb20fd44 2742 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2743 vFAIL("Regexp *+ operand could be empty");
b81d288d 2744#endif
bb20fd44 2745
fac92740 2746 parse_start = RExC_parse;
830247a4 2747 nextchar(pRExC_state);
a0d0e21e 2748
821b33a5 2749 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2750
2751 if (op == '*' && (flags&SIMPLE)) {
830247a4 2752 reginsert(pRExC_state, STAR, ret);
c277df42 2753 ret->flags = 0;
830247a4 2754 RExC_naughty += 4;
a0d0e21e
LW
2755 }
2756 else if (op == '*') {
2757 min = 0;
2758 goto do_curly;
a0ed51b3
LW
2759 }
2760 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2761 reginsert(pRExC_state, PLUS, ret);
c277df42 2762 ret->flags = 0;
830247a4 2763 RExC_naughty += 3;
a0d0e21e
LW
2764 }
2765 else if (op == '+') {
2766 min = 1;
2767 goto do_curly;
a0ed51b3
LW
2768 }
2769 else if (op == '?') {
a0d0e21e
LW
2770 min = 0; max = 1;
2771 goto do_curly;
2772 }
2773 nest_check:
e476b1b5 2774 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2775 vWARN3(RExC_parse,
b45f050a 2776 "%.*s matches null string many times",
830247a4 2777 RExC_parse - origparse,
b45f050a 2778 origparse);
a0d0e21e
LW
2779 }
2780
830247a4
IZ
2781 if (*RExC_parse == '?') {
2782 nextchar(pRExC_state);
2783 reginsert(pRExC_state, MINMOD, ret);
2784 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2785 }
830247a4
IZ
2786 if (ISMULT2(RExC_parse)) {
2787 RExC_parse++;
b45f050a
JF
2788 vFAIL("Nested quantifiers");
2789 }
a0d0e21e
LW
2790
2791 return(ret);
a687059c
LW
2792}
2793
2794/*
2795 - regatom - the lowest level
2796 *
2797 * Optimization: gobbles an entire sequence of ordinary characters so that
2798 * it can turn them into a single node, which is smaller to store and
2799 * faster to run. Backslashed characters are exceptions, each becoming a
2800 * separate node; the code is simpler that way and it's not worth fixing.
2801 *
b45f050a 2802 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2803STATIC regnode *
830247a4 2804S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2805{
c277df42 2806 register regnode *ret = 0;
a0d0e21e 2807 I32 flags;
45948336 2808 char *parse_start = RExC_parse;
a0d0e21e
LW
2809
2810 *flagp = WORST; /* Tentatively. */
2811
2812tryagain:
830247a4 2813 switch (*RExC_parse) {
a0d0e21e 2814 case '^':
830247a4
IZ
2815 RExC_seen_zerolen++;
2816 nextchar(pRExC_state);
e2509266 2817 if (RExC_flags & PMf_MULTILINE)
830247a4 2818 ret = reg_node(pRExC_state, MBOL);
e2509266 2819 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2820 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2821 else
830247a4 2822 ret = reg_node(pRExC_state, BOL);
fac92740 2823 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2824 break;
2825 case '$':
830247a4 2826 nextchar(pRExC_state);
b81d288d 2827 if (*RExC_parse)
830247a4 2828 RExC_seen_zerolen++;
e2509266 2829 if (RExC_flags & PMf_MULTILINE)
830247a4 2830 ret = reg_node(pRExC_state, MEOL);
e2509266 2831 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2832 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2833 else
830247a4 2834 ret = reg_node(pRExC_state, EOL);
fac92740 2835 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2836 break;
2837 case '.':
830247a4 2838 nextchar(pRExC_state);
e2509266 2839 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2840 ret = reg_node(pRExC_state, SANY);
2841 else
2842 ret = reg_node(pRExC_state, REG_ANY);
2843 *flagp |= HASWIDTH|SIMPLE;
830247a4 2844 RExC_naughty++;
fac92740 2845 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2846 break;
2847 case '[':
b45f050a 2848 {
830247a4 2849 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2850 ret = regclass(pRExC_state);
830247a4
IZ
2851 if (*RExC_parse != ']') {
2852 RExC_parse = oregcomp_parse;
b45f050a
JF
2853 vFAIL("Unmatched [");
2854 }
830247a4 2855 nextchar(pRExC_state);
a0d0e21e 2856 *flagp |= HASWIDTH|SIMPLE;
fac92740 2857 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2858 break;
b45f050a 2859 }
a0d0e21e 2860 case '(':
830247a4
IZ
2861 nextchar(pRExC_state);
2862 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2863 if (ret == NULL) {
bf93d4cc 2864 if (flags & TRYAGAIN) {
830247a4 2865 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2866 /* Make parent create an empty node if needed. */
2867 *flagp |= TRYAGAIN;
2868 return(NULL);
2869 }
a0d0e21e 2870 goto tryagain;
bf93d4cc 2871 }
a0d0e21e
LW
2872 return(NULL);
2873 }
c277df42 2874 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2875 break;
2876 case '|':
2877 case ')':
2878 if (flags & TRYAGAIN) {
2879 *flagp |= TRYAGAIN;
2880 return NULL;
2881 }
b45f050a 2882 vFAIL("Internal urp");
a0d0e21e
LW
2883 /* Supposed to be caught earlier. */
2884 break;
85afd4ae 2885 case '{':
830247a4
IZ
2886 if (!regcurly(RExC_parse)) {
2887 RExC_parse++;
85afd4ae
CS
2888 goto defchar;
2889 }
2890 /* FALL THROUGH */
a0d0e21e
LW
2891 case '?':
2892 case '+':
2893 case '*':
830247a4 2894 RExC_parse++;
b45f050a 2895 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2896 break;
2897 case '\\':
830247a4 2898 switch (*++RExC_parse) {
a0d0e21e 2899 case 'A':
830247a4
IZ
2900 RExC_seen_zerolen++;
2901 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2902 *flagp |= SIMPLE;
830247a4 2903 nextchar(pRExC_state);
fac92740 2904 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2905 break;
2906 case 'G':
830247a4
IZ
2907 ret = reg_node(pRExC_state, GPOS);
2908 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2909 *flagp |= SIMPLE;
830247a4 2910 nextchar(pRExC_state);
fac92740 2911 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2912 break;
2913 case 'Z':
830247a4 2914 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2915 *flagp |= SIMPLE;
a1917ab9 2916 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2917 nextchar(pRExC_state);
a0d0e21e 2918 break;
b85d18e9 2919 case 'z':
830247a4 2920 ret = reg_node(pRExC_state, EOS);
b85d18e9 2921 *flagp |= SIMPLE;
830247a4
IZ
2922 RExC_seen_zerolen++; /* Do not optimize RE away */
2923 nextchar(pRExC_state);
fac92740 2924 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2925 break;
4a2d328f 2926 case 'C':
f33976b4
DB
2927 ret = reg_node(pRExC_state, CANY);
2928 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2929 *flagp |= HASWIDTH|SIMPLE;
830247a4 2930 nextchar(pRExC_state);
fac92740 2931 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2932 break;
2933 case 'X':
830247a4 2934 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2935 *flagp |= HASWIDTH;
830247a4 2936 nextchar(pRExC_state);
fac92740 2937 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2938 break;
a0d0e21e 2939 case 'w':
eb160463 2940 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2941 *flagp |= HASWIDTH|SIMPLE;
830247a4 2942 nextchar(pRExC_state);
fac92740 2943 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2944 break;
2945 case 'W':
eb160463 2946 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2947 *flagp |= HASWIDTH|SIMPLE;
830247a4 2948 nextchar(pRExC_state);
fac92740 2949 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2950 break;
2951 case 'b':
830247a4
IZ
2952 RExC_seen_zerolen++;
2953 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2954 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2955 *flagp |= SIMPLE;
830247a4 2956 nextchar(pRExC_state);
fac92740 2957 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2958 break;
2959 case 'B':
830247a4
IZ
2960 RExC_seen_zerolen++;
2961 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2962 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2963 *flagp |= SIMPLE;
830247a4 2964 nextchar(pRExC_state);
fac92740 2965 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2966 break;
2967 case 's':
eb160463 2968 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2969 *flagp |= HASWIDTH|SIMPLE;
830247a4 2970 nextchar(pRExC_state);
fac92740 2971 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2972 break;
2973 case 'S':
eb160463 2974 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2975 *flagp |= HASWIDTH|SIMPLE;
830247a4 2976 nextchar(pRExC_state);
fac92740 2977 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2978 break;
2979 case 'd':
ffc61ed2 2980 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2981 *flagp |= HASWIDTH|SIMPLE;
830247a4 2982 nextchar(pRExC_state);
fac92740 2983 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2984 break;
2985 case 'D':
ffc61ed2 2986 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2987 *flagp |= HASWIDTH|SIMPLE;
830247a4 2988 nextchar(pRExC_state);
fac92740 2989 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2990 break;
a14b48bc
LW
2991 case 'p':
2992 case 'P':
3568d838 2993 {
830247a4 2994 char* oldregxend = RExC_end;
ccb2c380 2995 char* parse_start = RExC_parse - 2;
a14b48bc 2996
830247a4 2997 if (RExC_parse[1] == '{') {
3568d838 2998 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2999 RExC_end = strchr(RExC_parse, '}');
3000 if (!RExC_end) {
0da60cf5 3001 U8 c = (U8)*RExC_parse;
830247a4
IZ
3002 RExC_parse += 2;
3003 RExC_end = oldregxend;
0da60cf5 3004 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 3005 }
830247a4 3006 RExC_end++;
a14b48bc 3007 }
af6f566e 3008 else {
830247a4 3009 RExC_end = RExC_parse + 2;
af6f566e
HS
3010 if (RExC_end > oldregxend)
3011 RExC_end = oldregxend;
3012 }
830247a4 3013 RExC_parse--;
a14b48bc 3014
ffc61ed2 3015 ret = regclass(pRExC_state);
a14b48bc 3016
830247a4
IZ
3017 RExC_end = oldregxend;
3018 RExC_parse--;
ccb2c380
MP
3019
3020 Set_Node_Offset(ret, parse_start + 2);
3021 Set_Node_Cur_Length(ret);
830247a4 3022 nextchar(pRExC_state);
a14b48bc
LW
3023 *flagp |= HASWIDTH|SIMPLE;
3024 }
3025 break;
a0d0e21e
LW
3026 case 'n':
3027 case 'r':
3028 case 't':
3029 case 'f':
3030 case 'e':
3031 case 'a':
3032 case 'x':
3033 case 'c':
3034 case '0':
3035 goto defchar;
3036 case '1': case '2': case '3': case '4':
3037 case '5': case '6': case '7': case '8': case '9':
3038 {
830247a4 3039 I32 num = atoi(RExC_parse);
a0d0e21e 3040
830247a4 3041 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3042 goto defchar;
3043 else {
fac92740 3044 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3045 while (isDIGIT(*RExC_parse))
3046 RExC_parse++;
b45f050a 3047
eb160463 3048 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3049 vFAIL("Reference to nonexistent group");
830247a4 3050 RExC_sawback = 1;
eb160463
GS
3051 ret = reganode(pRExC_state,
3052 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3053 num);
a0d0e21e 3054 *flagp |= HASWIDTH;
fac92740
MJD
3055
3056 /* override incorrect value set in reganode MJD */
3057 Set_Node_Offset(ret, parse_start+1);
3058 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3059 RExC_parse--;
3060 nextchar(pRExC_state);
a0d0e21e
LW
3061 }
3062 }
3063 break;
3064 case '\0':
830247a4 3065 if (RExC_parse >= RExC_end)
b45f050a 3066 FAIL("Trailing \\");
a0d0e21e
LW
3067 /* FALL THROUGH */
3068 default:
c9f97d15
IZ
3069 /* Do not generate `unrecognized' warnings here, we fall
3070 back into the quick-grab loop below */
45948336 3071 parse_start--;
a0d0e21e
LW
3072 goto defchar;
3073 }
3074 break;
4633a7c4
LW
3075
3076 case '#':
e2509266 3077 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3078 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3079 if (RExC_parse < RExC_end)
4633a7c4
LW
3080 goto tryagain;
3081 }
3082 /* FALL THROUGH */
3083
a0d0e21e 3084 default: {
ba210ebe 3085 register STRLEN len;
58ae7d3f 3086 register UV ender;
a0d0e21e 3087 register char *p;
c277df42 3088 char *oldp, *s;
ba210ebe 3089 STRLEN numlen;
80aecb99 3090 STRLEN foldlen;
89ebb4a3 3091 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
3092
3093 parse_start = RExC_parse - 1;
a0d0e21e 3094
830247a4 3095 RExC_parse++;
a0d0e21e
LW
3096
3097 defchar:
58ae7d3f 3098 ender = 0;
eb160463
GS
3099 ret = reg_node(pRExC_state,
3100 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3101 s = STRING(ret);
830247a4
IZ
3102 for (len = 0, p = RExC_parse - 1;
3103 len < 127 && p < RExC_end;
a0d0e21e
LW
3104 len++)
3105 {
3106 oldp = p;
5b5a24f7 3107
e2509266 3108 if (RExC_flags & PMf_EXTENDED)
830247a4 3109 p = regwhite(p, RExC_end);
a0d0e21e
LW
3110 switch (*p) {
3111 case '^':
3112 case '$':
3113 case '.':
3114 case '[':
3115 case '(':
3116 case ')':
3117 case '|':
3118 goto loopdone;
3119 case '\\':
3120 switch (*++p) {
3121 case 'A':
1ed8eac0
JF
3122 case 'C':
3123 case 'X':
a0d0e21e
LW
3124 case 'G':
3125 case 'Z':
b85d18e9 3126 case 'z':
a0d0e21e
LW
3127 case 'w':
3128 case 'W':
3129 case 'b':
3130 case 'B':
3131 case 's':
3132 case 'S':
3133 case 'd':
3134 case 'D':
a14b48bc
LW
3135 case 'p':
3136 case 'P':
a0d0e21e
LW
3137 --p;
3138 goto loopdone;
3139 case 'n':
3140 ender = '\n';
3141 p++;
a687059c 3142 break;
a0d0e21e
LW
3143 case 'r':
3144 ender = '\r';
3145 p++;
a687059c 3146 break;
a0d0e21e
LW
3147 case 't':
3148 ender = '\t';
3149 p++;
a687059c 3150 break;
a0d0e21e
LW
3151 case 'f':
3152 ender = '\f';
3153 p++;
a687059c 3154 break;
a0d0e21e 3155 case 'e':
c7f1f016 3156 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3157 p++;
a687059c 3158 break;
a0d0e21e 3159 case 'a':
c7f1f016 3160 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3161 p++;
a687059c 3162 break;
a0d0e21e 3163 case 'x':
a0ed51b3
LW
3164 if (*++p == '{') {
3165 char* e = strchr(p, '}');
b81d288d 3166
b45f050a 3167 if (!e) {
830247a4 3168 RExC_parse = p + 1;
b45f050a
JF
3169 vFAIL("Missing right brace on \\x{}");
3170 }
de5f0749 3171 else {
a4c04bdc
NC
3172 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3173 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3174 numlen = e - p - 1;
3175 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3176 if (ender > 0xff)
3177 RExC_utf8 = 1;
a0ed51b3
LW
3178 p = e + 1;
3179 }
a0ed51b3
LW
3180 }
3181 else {
a4c04bdc 3182 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3183 numlen = 2;
3184 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3185 p += numlen;
3186 }
a687059c 3187 break;
a0d0e21e
LW
3188 case 'c':
3189 p++;
bbce6d69
PP
3190 ender = UCHARAT(p++);
3191 ender = toCTRL(ender);
a687059c 3192 break;
a0d0e21e
LW
3193 case '0': case '1': case '2': case '3':case '4':
3194 case '5': case '6': case '7': case '8':case '9':
3195 if (*p == '0' ||
830247a4 3196 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3197 I32 flags = 0;
3198 numlen = 3;
3199 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3200 p += numlen;
3201 }
3202 else {
3203 --p;
3204 goto loopdone;
a687059c
LW
3205 }
3206 break;
a0d0e21e 3207 case '\0':
830247a4 3208 if (p >= RExC_end)
b45f050a 3209 FAIL("Trailing \\");
a687059c 3210 /* FALL THROUGH */
a0d0e21e 3211 default:
e476b1b5 3212 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3213 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3214 goto normal_default;
a0d0e21e
LW
3215 }
3216 break;
a687059c 3217 default:
a0ed51b3 3218 normal_default:
fd400ab9 3219 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3220 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3221 &numlen, 0);
a0ed51b3
LW
3222 p += numlen;
3223 }
3224 else
3225 ender = *p++;
a0d0e21e 3226 break;
a687059c 3227 }
e2509266 3228 if (RExC_flags & PMf_EXTENDED)
830247a4 3229 p = regwhite(p, RExC_end);
60a8b682
JH
3230 if (UTF && FOLD) {
3231 /* Prime the casefolded buffer. */
ac7e0132 3232 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3233 }
a0d0e21e
LW
3234 if (ISMULT2(p)) { /* Back off on ?+*. */
3235 if (len)
3236 p = oldp;
16ea2a2e 3237 else if (UTF) {
0ebc6274
JH
3238 STRLEN unilen;
3239
80aecb99 3240 if (FOLD) {
60a8b682 3241 /* Emit all the Unicode characters. */
80aecb99
JH
3242 for (foldbuf = tmpbuf;
3243 foldlen;
3244 foldlen -= numlen) {
3245 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3246 if (numlen > 0) {
0ebc6274
JH
3247 reguni(pRExC_state, ender, s, &unilen);
3248 s += unilen;
3249 len += unilen;
3250 /* In EBCDIC the numlen
3251 * and unilen can differ. */
9dc45d57 3252 foldbuf += numlen;
47654450
JH
3253 if (numlen >= foldlen)
3254 break;
9dc45d57
JH
3255 }
3256 else
3257 break; /* "Can't happen." */
80aecb99
JH
3258 }
3259 }
3260 else {
0ebc6274 3261 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3262 if (unilen > 0) {
0ebc6274
JH
3263 s += unilen;
3264 len += unilen;
9dc45d57 3265 }
80aecb99 3266 }
a0ed51b3 3267 }
a0d0e21e
LW
3268 else {
3269 len++;
eb160463 3270 REGC((char)ender, s++);
a0d0e21e
LW
3271 }
3272 break;
a687059c 3273 }
16ea2a2e 3274 if (UTF) {
0ebc6274
JH
3275 STRLEN unilen;
3276
80aecb99 3277 if (FOLD) {
60a8b682 3278 /* Emit all the Unicode characters. */
80aecb99
JH
3279 for (foldbuf = tmpbuf;
3280 foldlen;
3281 foldlen -= numlen) {
3282 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3283 if (numlen > 0) {
0ebc6274
JH
3284 reguni(pRExC_state, ender, s, &unilen);
3285 len += unilen;
3286 s += unilen;
3287 /* In EBCDIC the numlen
3288 * and unilen can differ. */
9dc45d57 3289 foldbuf += numlen;
47654450
JH
3290 if (numlen >= foldlen)
3291 break;
9dc45d57
JH
3292 }
3293 else
3294 break;
80aecb99
JH
3295 }
3296 }
3297 else {
0ebc6274 3298 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3299 if (unilen > 0) {
0ebc6274
JH
3300 s += unilen;
3301 len += unilen;
9dc45d57 3302 }
80aecb99
JH
3303 }
3304 len--;
a0ed51b3
LW
3305 }
3306 else
eb160463 3307 REGC((char)ender, s++);
a0d0e21e
LW
3308 }
3309 loopdone:
830247a4 3310 RExC_parse = p - 1;
fac92740 3311 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3312 nextchar(pRExC_state);
793db0cb
JH
3313 {
3314 /* len is STRLEN which is unsigned, need to copy to signed */
3315 IV iv = len;
3316 if (iv < 0)
3317 vFAIL("Internal disaster");
3318 }
a0d0e21e
LW
3319 if (len > 0)
3320 *flagp |= HASWIDTH;
090f7165 3321 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 3322 *flagp |= SIMPLE;
c277df42 3323 if (!SIZE_ONLY)
cd439c50
IZ
3324 STR_LEN(ret) = len;
3325 if (SIZE_ONLY)
830247a4 3326 RExC_size += STR_SZ(len);
cd439c50 3327 else
830247a4 3328 RExC_emit += STR_SZ(len);
a687059c 3329 }
a0d0e21e
LW
3330 break;
3331 }
a687059c 3332
60a8b682
JH
3333 /* If the encoding pragma is in effect recode the text of
3334 * any EXACT-kind nodes. */
22c54be3 3335 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
3336 STRLEN oldlen = STR_LEN(ret);
3337 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3338
3339 if (RExC_utf8)
3340 SvUTF8_on(sv);
3341 if (sv_utf8_downgrade(sv, TRUE)) {
3342 char *s = sv_recode_to_utf8(sv, PL_encoding);
3343 STRLEN newlen = SvCUR(sv);
3344
3345 if (SvUTF8(sv))
3346 RExC_utf8 = 1;
3347 if (!SIZE_ONLY) {
3348 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3349 (int)oldlen, STRING(ret),
3350 (int)newlen, s));
3351 Copy(s, STRING(ret), newlen, char);
3352 STR_LEN(ret) += newlen - oldlen;
3353 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3354 } else
3355 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3356 }
a72c7584
JH
3357 }
3358
a0d0e21e 3359 return(ret);
a687059c
LW
3360}
3361
873ef191 3362STATIC char *
cea2e8a9 3363S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3364{
3365 while (p < e) {
3366 if (isSPACE(*p))
3367 ++p;
3368 else if (*p == '#') {
3369 do {
3370 p++;
3371 } while (p < e && *p != '\n');
3372 }
3373 else
3374 break;
3375 }
3376 return p;
3377}
3378
b8c5462f
JH
3379/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3380 Character classes ([:foo:]) can also be negated ([:^foo:]).
3381 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3382 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3383 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3384
3385#define POSIXCC_DONE(c) ((c) == ':')
3386#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3387#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3388
b8c5462f 3389STATIC I32
830247a4 3390S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3391{
3392 char *posixcc = 0;
936ed897 3393 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3394
830247a4 3395 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3396 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3397 POSIXCC(UCHARAT(RExC_parse))) {
3398 char c = UCHARAT(RExC_parse);
830247a4 3399 char* s = RExC_parse++;
b81d288d 3400
9a86a77b 3401 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3402 RExC_parse++;
3403 if (RExC_parse == RExC_end)
620e46c5 3404 /* Grandfather lone [:, [=, [. */
830247a4 3405 RExC_parse = s;
620e46c5 3406 else {
830247a4 3407 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3408
80916619
NC
3409 assert(*t == c);
3410
9a86a77b 3411 if (UCHARAT(RExC_parse) == ']') {
830247a4 3412 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3413 posixcc = s + 1;
3414 if (*s == ':') {
3415 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
80916619
NC
3416 I32 skip = t - posixcc;
3417
3418 /* Initially switch on the length of the name. */
3419 switch (skip) {
3420 case 4:
3421 if (memEQ(posixcc, "word", 4)) {
3422 /* this is not POSIX, this is the Perl \w */;
3423 namedclass
3424 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3425 }
cc4319de 3426 break;
80916619
NC
3427 case 5:
3428 /* Names all of length 5. */
3429 /* alnum alpha ascii blank cntrl digit graph lower
3430 print punct space upper */
3431 /* Offset 4 gives the best switch position. */
3432 switch (posixcc[4]) {
3433 case 'a':
3434 if (memEQ(posixcc, "alph", 4)) {
3435 /* a */
3436 namedclass
3437 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3438 }
3439 break;
3440 case 'e':
3441 if (memEQ(posixcc, "spac", 4)) {
3442 /* e */
3443 namedclass
3444 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3445 }
3446 break;
3447 case 'h':
3448 if (memEQ(posixcc, "grap", 4)) {
3449 /* h */
3450 namedclass
3451 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3452 }
3453 break;
3454 case 'i':
3455 if (memEQ(posixcc, "asci", 4)) {
3456 /* i */
3457 namedclass
3458 = complement ? ANYOF_NASCII : ANYOF_ASCII;
3459 }
3460 break;
3461 case 'k':
3462 if (memEQ(posixcc, "blan", 4)) {
3463 /* k */
3464 namedclass
3465 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3466 }
3467 break;
3468 case 'l':
3469 if (memEQ(posixcc, "cntr", 4)) {
3470 /* l */
3471 namedclass
3472 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3473 }
3474 break;
3475 case 'm':
3476 if (memEQ(posixcc, "alnu", 4)) {